In [1]:
import numpy as np 
import pandas as pd
pd.options.mode.chained_assignment = None
import os 
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.feature_extraction.text import CountVectorizer       
from sklearn.model_selection import train_test_split                
from sklearn.linear_model import LogisticRegression                 
from sklearn.metrics import accuracy_score                           
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder 
import re 
import nltk
from nltk import word_tokenize
nltk.download('stopwords')

/kaggle/input/twitter-entity-sentiment-analysis/twitter_validation.csv
/kaggle/input/twitter-entity-sentiment-analysis/twitter_training.csv
/kaggle/input/processed-twitter-comment-2/07_14_comment.csv
/kaggle/input/processed-twitter-comment-2/06_30_comment.csv
/kaggle/input/processed-twitter-comment-2/06_14_comment.csv
/kaggle/input/processed-twitter-comment-2/06_22_comment.csv
/kaggle/input/processed-twitter-comment-2/07_11_comment.csv
/kaggle/input/processed-twitter-comment-2/06_16_comment.csv
/kaggle/input/processed-twitter-comment-2/06_17_comment.csv
/kaggle/input/processed-twitter-comment-2/07_02_comment.csv
/kaggle/input/processed-twitter-comment-2/06_26_comment.csv
/kaggle/input/processed-twitter-comment-2/07_05_comment.csv
/kaggle/input/processed-twitter-comment-2/07_07_comment.csv
/kaggle/input/processed-twitter-comment-2/07_13_comment.csv
/kaggle/input/processed-twitter-comment-2/06_29_comment.csv
/kaggle/input/processed-twitter-comment-2/07_03_comment.csv
/kaggle/input/proces



[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

This part is to do the sentiment analysis to the Twitter comment data.

In [2]:
#import train, test, validation dataset to dataframe

#Validation dataset
val=pd.read_csv("./twitter-entity-sentiment-analysis/twitter_validation.csv", header=None)
#Train dataset, test dataset
train=pd.read_csv("./twitter-entity-sentiment-analysis/twitter_training.csv", header=None)

In [3]:
# set the dataframe column head names
train.columns=['id','information','type','text']
val.columns=['id','information','type','text']

In [4]:
train_data=train
train_data

Unnamed: 0,id,information,type,text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
...,...,...,...,...
74677,9200,Nvidia,Positive,Just realized that the Windows partition of my...
74678,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74679,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74680,9200,Nvidia,Positive,Just realized between the windows partition of...


In [5]:
val_data=val
val_data

Unnamed: 0,id,information,type,text
0,3364,Facebook,Irrelevant,I mentioned on Facebook that I was struggling ...
1,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
2,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
3,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
4,4433,Google,Neutral,Now the President is slapping Americans in the...
...,...,...,...,...
995,4891,GrandTheftAuto(GTA),Irrelevant,⭐️ Toronto is the arts and culture capital of ...
996,4359,CS-GO,Irrelevant,tHIS IS ACTUALLY A GOOD MOVE TOT BRING MORE VI...
997,2652,Borderlands,Positive,Today sucked so it’s time to drink wine n play...
998,8069,Microsoft,Positive,Bought a fraction of Microsoft today. Small wins.


Train/Test/Validation dataset preprocessing

In [6]:
# Train dataset preprocessing
train_data["lower"] = train_data.text.str.lower()                                             #lowercase
train_data["lower"] = [str(data) for data in train_data.lower]                                #converting all to string
train_data["lower"] = train_data.lower.apply(lambda x: re.sub('[^A-Za-z0-9 ]+', ' ', x))      # delete all special characters

# Test datset preprocessing
val_data["lower"] = val_data.text.str.lower()                                                 #lowercase
val_data["lower"] = [str(data) for data in val_data.lower]                                    #converting all to string
val_data["lower"] = val_data.lower.apply(lambda x: re.sub('[^A-Za-z0-9 ]+', ' ', x))          #regex

In [7]:
#Tokenization
tokens_text = [word_tokenize(str(word)) for word in train_data.lower]

#Unique word counter
tokens_counter = [item for sublist in tokens_text for item in sublist]
print("Number of tokens: ", len(set(tokens_counter)))

Number of tokens:  30436


In [8]:
#Choosing english stopwords
stopwords_nltk = nltk.corpus.stopwords
stop_words = stopwords_nltk.words('english')
#stop_words

Build the Sentiment analysis logic regression model: using the Bag of words method, and N-grams algorithm to ensure multiple words combination classify. I built one gram, two grams and three grams model to compare.

1. One gram model

In [9]:
#One word n-gram
bow_counts = CountVectorizer(
    tokenizer=word_tokenize,
    stop_words=stop_words,           #English Stopwords
    ngram_range=(1, 1)              #analysis of one word
)

In [10]:
#Train - Test splitting
reviews_train, reviews_test = train_test_split(train_data, test_size=0.2, random_state=0)

In [None]:
#Creation of encoding related to train dataset
X_train_bow = bow_counts.fit_transform(reviews_train.lower)
#Transformation of test dataset with train encoding
X_test_bow = bow_counts.transform(reviews_test.lower)

In [12]:
X_test_bow

<14937x28993 sparse matrix of type '<class 'numpy.int64'>'
	with 161222 stored elements in Compressed Sparse Row format>

In [13]:
#Labels for train and test encoding
y_train_bow = reviews_train['type']
y_test_bow = reviews_test['type']

In [14]:
#Total of registers per category
y_test_bow.value_counts() / y_test_bow.shape[0]

Negative      0.299190
Positive      0.282252
Neutral       0.245632
Irrelevant    0.172926
Name: type, dtype: float64

In [15]:
# Logistic regression
model1 = LogisticRegression(C=1, solver="liblinear",max_iter=1000)
model1.fit(X_train_bow, y_train_bow)
# Prediction
test_pred = model1.predict(X_test_bow)
print("Accuracy: ", accuracy_score(y_test_bow, test_pred) * 100)

Accuracy:  83.65803039432282


In [16]:
#Validation data
X_val_bow = bow_counts.transform(val_data.lower)
y_val_bow = val_data['type']

In [17]:
Val_res = model1.predict(X_val_bow)
print("Accuracy: ", accuracy_score(y_val_bow, Val_res) * 100)

Accuracy:  93.4


2. Two gram model

In [18]:
#2 words n-gram
bow_counts = CountVectorizer(
    tokenizer=word_tokenize,
    ngram_range=(1,2)
)
#Data labeling
X_train_bow = bow_counts.fit_transform(reviews_train.lower)
X_test_bow = bow_counts.transform(reviews_test.lower)
X_val_bow = bow_counts.transform(val_data.lower)



In [19]:
model2 = LogisticRegression(C=0.9, solver="liblinear",max_iter=1500)
# Logistic regression
model2.fit(X_train_bow, y_train_bow)
# Prediction
test_pred_2 = model2.predict(X_test_bow)
print("Accuracy: ", accuracy_score(y_test_bow, test_pred_2) * 100)

Accuracy:  91.22313717613979


In [20]:
y_val_bow = val_data['type']
Val_pred_2 = model2.predict(X_val_bow)
print("Accuracy: ", accuracy_score(y_val_bow, Val_pred_2) * 100)

Accuracy:  98.1


3. Four gram model

In [21]:
#4 words n-gram
bow_counts = CountVectorizer(
    tokenizer=word_tokenize,
    ngram_range=(1,4)
)
#Data labeling
X_train_bow = bow_counts.fit_transform(reviews_train.lower)
X_test_bow = bow_counts.transform(reviews_test.lower)
X_val_bow = bow_counts.transform(val_data.lower)



In [22]:
model3 = LogisticRegression(C=0.9, solver="liblinear",max_iter=1500)
# Logistic regression
model3.fit(X_train_bow, y_train_bow)
# Prediction
test_pred_3 = model3.predict(X_test_bow)
print("Accuracy: ", accuracy_score(y_test_bow, test_pred_3) * 100)

Accuracy:  90.79467095132891


In [23]:
y_val_bow = val_data['type']
Val_pred_3 = model3.predict(X_val_bow)
print("Accuracy: ", accuracy_score(y_val_bow, Val_pred_3) * 100)

Accuracy:  98.6


Comparing the average accuracy, one gram: (83.66+93.4)/2 = 88.53, two gram: (91.22+98.1)/2 = 94.66, four gram: (90.79+98.6)/2= 94.70, 94.70 >94.66>88.53, thus four gram BoW model performs best.

build the sentiment count data frame

In [24]:
#sentiment_count_df = pd.read_csv('/kaggle/working/sentiment_count.csv')
sentiment_count_df = pd.DataFrame({'date':[],'Positive':[],'Neutral':[],'Negative':[],'Irrelevant':[]})
#sentiment_count_df.columns = ['date','Positive','Neutral','Negative','Irrelevant']
print(sentiment_count_df)

Empty DataFrame
Columns: [date, Positive, Neutral, Negative, Irrelevant]
Index: []


Check point: Use the four gram model to do the sentiment analysis

In [25]:
# The user needs to manully change the String date to do the sentiement analysis one by one
date = '06_' + '25'
filename = date + '_comment.csv'
fileaddress = './processed-twitter-comment-2/' + str(filename)
comment_df = pd.read_csv(fileaddress)
print(comment_df)
#print(comment_df)



### The user needs to manually pick up the command with different text because the different columns format of comment CSV files! ###


#comment_df.drop(['withheld'], axis = 1,inplace=True)
comment_df.columns = ['Unnamed: 0','author_id','text','username','retweet','reply','like','quote','bookmark']
#comment_df.columns = ['Unnamed: 0','text','author_id','username','retweet','reply','like','quote','bookmark']
#comment_df.columns = ['Unnamed: 0','Unnamed: 0.1','author_id','text','username','retweet','reply','like','quote','bookmark']
#comment_df.columns = ['Unnamed: 0','Unnamed: 0.1','text','author_id','username','retweet','reply','like','quote','bookmark']
#comment_df.columns = ['Unnamed: 0','Unnamed: 0.1','author_id','text','username','retweet','reply','like','quote','bookmark']
#comment_df.columns = ['Unnamed: 0','author_id','text','withheld','username','retweet','reply','like','quote','bookmark']
#comment_df.columns = ['Unnamed: 0.1','Unnamed: 0','author_id','text','username','retweet','reply','like','quote','bookmark']
#comment_df.columns = ['Unnamed: 0.1','Unnamed: 0','text','author_id','username','retweet','reply','like','quote','bookmark']

#comment_df.drop(['Unnamed: 0'], axis = 1,inplace=True)
#comment_df.drop(['Unnamed: 0.1'], axis = 1,inplace=True)

# extra stopwords
interrupt_word_list = ['rt','http','https']

# Preprocessing
#print(comment_df)
comment_df["lower"]=comment_df.text.str.lower() #lowercase
comment_df["lower"]=[str(data) for data in comment_df.lower] #converting all to string
comment_df["lower"]=comment_df.lower.apply(lambda x: re.sub('[^A-Za-z0-9 ]+', ' ', x)) #remove all punctuations

# remove all extra useless words
#comment_df["lower"]=comment_df.lower.apply(lambda x: re.sub('rt', ' ', x))             
#comment_df["lower"]=comment_df.lower.apply(lambda x: re.sub('http', ' ', x)) 
#comment_df["lower"]=comment_df.lower.apply(lambda x: re.sub('https', ' ', x)) 
#comment_df["lower"]=comment_df.lower.apply(lambda x: re.sub('[a-z]+', ' ', x))  
for wrds in interrupt_word_list:
    comment_df["lower"]=comment_df.lower.apply(lambda x: re.sub(wrds, ' ', x))

#comment_df["lower"]=comment_df.lower.apply(lambda x: re.sub('[^A-Za-z0-9 ]+', ' ', x)) #regex
print(comment_df)

    Unnamed: 0            author_id  \
0            0   772012789883809794   
1            1  1637044857867993090   
2            2   948001724387782657   
3            3  1624236141103796226   
4            4  1237432588081541120   
..         ...                  ...   
88          88  1639066415645995008   
89          89           3952072093   
90          90            269271627   
91          91  1552318939144851460   
92          92  1643722152066134018   

                                                 text         username  \
0   @TripleStrucK @gedjesis @YeetingNoodle -plug i...     TripleStrucK   
1   RT @artimaaaeus: ghost and cpt mactavish but i...      artimaaaeus   
2   RT @loneghostwolf88: 4th entry of the beach bo...  loneghostwolf88   
3   RT @AmikoRoyAi: Caught flirting on a mission 😮...       AmikoRoyAi   
4   🔴 LIVE - CALL OF DUTY: MODERN WARFARE II Seaso...          YouTube   
..                                                ...              ...   
88  RT @ozyma

Using the model to predict

In [26]:
#sentiment analysis using 4-words ngram
X_comment_bow = bow_counts.transform(comment_df.lower)
comment_pred_3 = model3.predict(X_comment_bow)
print(comment_pred_3)

['Irrelevant' 'Neutral' 'Positive' 'Neutral' 'Neutral' 'Neutral' 'Neutral'
 'Neutral' 'Neutral' 'Neutral' 'Neutral' 'Neutral' 'Neutral' 'Positive'
 'Neutral' 'Negative' 'Neutral' 'Neutral' 'Neutral' 'Positive' 'Positive'
 'Neutral' 'Neutral' 'Neutral' 'Irrelevant' 'Irrelevant' 'Negative'
 'Neutral' 'Neutral' 'Positive' 'Neutral' 'Irrelevant' 'Neutral' 'Neutral'
 'Neutral' 'Negative' 'Neutral' 'Neutral' 'Negative' 'Neutral' 'Negative'
 'Neutral' 'Neutral' 'Neutral' 'Neutral' 'Neutral' 'Neutral' 'Neutral'
 'Negative' 'Neutral' 'Positive' 'Neutral' 'Neutral' 'Neutral' 'Neutral'
 'Negative' 'Neutral' 'Neutral' 'Neutral' 'Neutral' 'Neutral' 'Neutral'
 'Neutral' 'Neutral' 'Neutral' 'Neutral' 'Negative' 'Neutral' 'Neutral'
 'Neutral' 'Neutral' 'Neutral' 'Positive' 'Neutral' 'Neutral' 'Positive'
 'Neutral' 'Neutral' 'Neutral' 'Positive' 'Neutral' 'Neutral' 'Neutral'
 'Neutral' 'Neutral' 'Neutral' 'Negative' 'Neutral' 'Neutral' 'Negative'
 'Neutral' 'Neutral' 'Negative']


Create the sentiment count dictionary

In [27]:
sent_dict = {'date': date, 'Positive': 0,'Neutral': 0,'Negative': 0,'Irrelevant': 0}
for sent in comment_pred_3:
    if sent == 'Positive':
        sent_dict['Positive'] = sent_dict['Positive'] + 1
    elif sent == 'Neutral':
        sent_dict['Neutral'] = sent_dict['Neutral'] + 1
    elif sent == 'Negative':
        sent_dict['Negative'] = sent_dict['Negative'] + 1
    else:
        sent_dict['Irrelevant'] = sent_dict['Irrelevant'] + 1

print(sent_dict)
print(comment_pred_3)

{'date': '06_25', 'Positive': 9, 'Neutral': 69, 'Negative': 11, 'Irrelevant': 4}
['Irrelevant' 'Neutral' 'Positive' 'Neutral' 'Neutral' 'Neutral' 'Neutral'
 'Neutral' 'Neutral' 'Neutral' 'Neutral' 'Neutral' 'Neutral' 'Positive'
 'Neutral' 'Negative' 'Neutral' 'Neutral' 'Neutral' 'Positive' 'Positive'
 'Neutral' 'Neutral' 'Neutral' 'Irrelevant' 'Irrelevant' 'Negative'
 'Neutral' 'Neutral' 'Positive' 'Neutral' 'Irrelevant' 'Neutral' 'Neutral'
 'Neutral' 'Negative' 'Neutral' 'Neutral' 'Negative' 'Neutral' 'Negative'
 'Neutral' 'Neutral' 'Neutral' 'Neutral' 'Neutral' 'Neutral' 'Neutral'
 'Negative' 'Neutral' 'Positive' 'Neutral' 'Neutral' 'Neutral' 'Neutral'
 'Negative' 'Neutral' 'Neutral' 'Neutral' 'Neutral' 'Neutral' 'Neutral'
 'Neutral' 'Neutral' 'Neutral' 'Neutral' 'Negative' 'Neutral' 'Neutral'
 'Neutral' 'Neutral' 'Neutral' 'Positive' 'Neutral' 'Neutral' 'Positive'
 'Neutral' 'Neutral' 'Neutral' 'Positive' 'Neutral' 'Neutral' 'Neutral'
 'Neutral' 'Neutral' 'Neutral' 'Negative' 'Neutr

append the sentiment count result that day to dataframes

In [28]:
# append sentiment result to 
df_newdate = pd.DataFrame({'date': [sent_dict['date']], 'Positive': [sent_dict['Positive']], 'Neutral': [sent_dict['Neutral']], 'Negative': [sent_dict['Negative']], 'Irrelevant': [sent_dict['Irrelevant']]})
sentiment_count_df = sentiment_count_df.append(df_newdate)
print(sentiment_count_df)

    date  Positive  Neutral  Negative  Irrelevant
0  06_25       9.0     69.0      11.0         4.0


  sentiment_count_df = sentiment_count_df.append(df_newdate)


Sentiment count update for each day, if not all dates comment analysis finished, return to the checkpoint.
After all dates sentiment analysis finished, append the sentiment results to dataframe.

In [29]:
# append sentiment result to comments
comment_df['sentiment'] = comment_pred_3

Save the results to CSV files

In [30]:
# save sentiment results
print(comment_df)
save_filename = date + '_comment_sa.csv'         

    Unnamed: 0            author_id  \
0            0   772012789883809794   
1            1  1637044857867993090   
2            2   948001724387782657   
3            3  1624236141103796226   
4            4  1237432588081541120   
..         ...                  ...   
88          88  1639066415645995008   
89          89           3952072093   
90          90            269271627   
91          91  1552318939144851460   
92          92  1643722152066134018   

                                                 text         username  \
0   @TripleStrucK @gedjesis @YeetingNoodle -plug i...     TripleStrucK   
1   RT @artimaaaeus: ghost and cpt mactavish but i...      artimaaaeus   
2   RT @loneghostwolf88: 4th entry of the beach bo...  loneghostwolf88   
3   RT @AmikoRoyAi: Caught flirting on a mission 😮...       AmikoRoyAi   
4   🔴 LIVE - CALL OF DUTY: MODERN WARFARE II Seaso...          YouTube   
..                                                ...              ...   
88  RT @ozyma

In [31]:
# save sentiment count results
comment_df.to_csv(save_filename)
sentiment_count_df.to_csv('sentiment_count.csv')   