In [2]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn import preprocessing

In [3]:
emotions = pd.read_csv('../Data/text_emotion.csv')
emotions_copy = emotions.copy()
emotions.head()

Unnamed: 0,tweet_id,sentiment,author,content
0,1956967341,empty,xoshayzers,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,wannamama,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,coolfunky,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,czareaquino,wants to hang out with friends SOON!
4,1956968416,neutral,xkilljoyx,@dannycastillo We want to trade with someone w...


In [4]:
drop_rows_lst = ['empty', 'enthusiasm', 'worry', 'surprise', 'fun', 'boredom', 'relief', 'anger']
emotions = emotions[~emotions['sentiment'].isin(drop_rows_lst)]
emotions.head()

Unnamed: 0,tweet_id,sentiment,author,content
1,1956967666,sadness,wannamama,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,coolfunky,Funeral ceremony...gloomy friday...
4,1956968416,neutral,xkilljoyx,@dannycastillo We want to trade with someone w...
6,1956968487,sadness,ShansBee,"I should be sleep, but im not! thinking about ..."
8,1956969035,sadness,nic0lepaula,@charviray Charlene my love. I miss you


In [5]:
vals_to_replace = {'sadness': 1, 'neutral': 2, 'love': 4, 'hate': 0, 'happiness': 3}
emotions['sent_num'] = emotions.sentiment.map(vals_to_replace)
emotions['sent_num'].value_counts()

2    8638
3    5209
1    5165
4    3842
0    1323
Name: sent_num, dtype: int64

In [6]:
# auxiliar function to remove a pattern defined by a regular expression 
def remove_by_regex(tweet, regexp):
        return re.sub(regexp, '', tweet)

# 3 specific cleaning functions to remove numbers, url's and special characters
def remove_numbers(tweet):
    return remove_by_regex(tweet, re.compile(r"[1234567890]"))

def remove_url(tweet):
    return remove_by_regex(tweet, re.compile(r"http.?://[^\s]+[\s]?"))

def remove_special_char(tweet):
    return re.sub(r"[^a-zA-Z0-9 ]", "", tweet) #add space placeholder

# general cleaning function to do it all at once
def clean_up(tweet):
    tweet = remove_numbers(tweet)
    tweet = remove_url(tweet)
    tweet = remove_special_char(tweet)
    return tweet.lower().strip()

In [7]:
emotions["content"] = emotions["content"].apply(clean_up)

In [8]:
stemmer = SnowballStemmer("english")

In [9]:
emotions["content_stemmed"] = emotions["content"].apply(stemmer.stem)

In [10]:
emotions["content_stemmed"] = [' '.join(x.split()) for x in emotions["content"]]

In [11]:
vectorizer = TfidfVectorizer()
content_vect = vectorizer.fit_transform(emotions.content_stemmed)
vector_df = pd.DataFrame.sparse.from_spmatrix(content_vect.tocoo(), columns = vectorizer.get_feature_names())

In [12]:
vector_df.head()

Unnamed: 0,aa,aaa,aaaa,aaaaaaaa,aaaaaaaaaahhhhhhhh,aaaaaaaaaamazing,aaaaaaaafternoon,aaaaaaaahhhhhhhh,aaaaaah,aaaaaalcohol,...,zur,zwriter,zyber,zykloid,zyote,zzerbe,zzz,zzzz,zzzzy,zzzzzzzgoodnight
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
emotions.reset_index(inplace=True)
emotions

Unnamed: 0,level_0,index,tweet_id,sentiment,author,content,sent_num,content_stemmed
0,0,1,1956967666,sadness,wannamama,layin n bed with a headache ughhhhwaitin on y...,1,layin n bed with a headache ughhhhwaitin on yo...
1,1,2,1956967696,sadness,coolfunky,funeral ceremonygloomy friday,1,funeral ceremonygloomy friday
2,2,4,1956968416,neutral,xkilljoyx,dannycastillo we want to trade with someone wh...,2,dannycastillo we want to trade with someone wh...
3,3,6,1956968487,sadness,ShansBee,i should be sleep but im not thinking about an...,1,i should be sleep but im not thinking about an...
4,4,8,1956969035,sadness,nic0lepaula,charviray charlene my love i miss you,1,charviray charlene my love i miss you
...,...,...,...,...,...,...,...,...
24172,24172,39995,1753918954,neutral,showMe_Heaven,johnlloydtaylor,2,johnlloydtaylor
24173,24173,39996,1753919001,love,drapeaux,happy mothers day all my love,4,happy mothers day all my love
24174,24174,39997,1753919005,love,JenniRox,happy mothers day to all the mommies out there...,4,happy mothers day to all the mommies out there...
24175,24175,39998,1753919043,happiness,ipdaman1,niariley wassup beautiful follow me peep out ...,3,niariley wassup beautiful follow me peep out m...


In [15]:
vector_df['sent_num'] = emotions['sent_num']
vector_df.head()

Unnamed: 0,aa,aaa,aaaa,aaaaaaaa,aaaaaaaaaahhhhhhhh,aaaaaaaaaamazing,aaaaaaaafternoon,aaaaaaaahhhhhhhh,aaaaaah,aaaaaalcohol,...,zwriter,zyber,zykloid,zyote,zzerbe,zzz,zzzz,zzzzy,zzzzzzzgoodnight,sent_num
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [16]:
train_set, test_set = train_test_split(vector_df, test_size=0.3)
train_X = train_set.drop(columns='sent_num')
train_y = train_set['sent_num']
test_X = test_set.drop(columns='sent_num')
test_y = test_set['sent_num']

### Random forest classifier

In [44]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(max_depth=1000)
rfc.fit(train_X, train_y)
predictions_rfc = rfc.predict(train_X)

In [45]:
accuracy_score(train_y, predictions_rfc)

0.9960999822726467

In [47]:
predictions_rfc_test = rfc.predict(test_X)
accuracy_score(test_y, predictions_rfc_test)

0.5022056796250345

In [47]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
clf.fit(train_X, train_y)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

# Scale features

In [16]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler(with_mean=False)
scaler.fit(train_X)
train_X = scaler.transform(train_X)

ValueError: array is too big; `arr.size * arr.dtype.itemsize` is larger than the maximum possible size.

In [None]:
train_X

# Models Prediction

Linear Regression

In [35]:
model_LR = LinearRegression()
LR_fit = model_LR.fit(train_X, train_y)

In [36]:
model_LR.score(train_X, train_y)

0.9681858102069318

In [37]:
predictions_LR = model_LR.predict(train_X)
predictions_LR = [x.astype(int) for x in predictions_LR]
list(zip(predictions_LR, train_y))

[(3, 3),
 (3, 4),
 (2, 2),
 (1, 2),
 (0, 1),
 (2, 3),
 (1, 1),
 (1, 2),
 (1, 2),
 (4, 4),
 (3, 4),
 (3, 3),
 (1, 2),
 (1, 2),
 (0, 1),
 (4, 4),
 (3, 3),
 (1, 1),
 (0, 0),
 (3, 3),
 (1, 1),
 (2, 3),
 (1, 1),
 (1, 1),
 (0, 1),
 (0, 0),
 (1, 2),
 (2, 3),
 (4, 4),
 (2, 2),
 (2, 2),
 (1, 1),
 (0, 1),
 (1, 2),
 (2, 2),
 (1, 1),
 (1, 1),
 (2, 2),
 (0, 1),
 (3, 4),
 (2, 3),
 (0, 1),
 (2, 3),
 (2, 2),
 (3, 4),
 (1, 2),
 (1, 1),
 (1, 2),
 (1, 2),
 (1, 2),
 (3, 4),
 (1, 2),
 (2, 2),
 (2, 2),
 (3, 4),
 (2, 3),
 (2, 3),
 (1, 1),
 (1, 2),
 (3, 4),
 (1, 2),
 (1, 1),
 (3, 4),
 (2, 2),
 (4, 4),
 (0, 0),
 (1, 1),
 (1, 2),
 (3, 3),
 (1, 1),
 (2, 3),
 (2, 2),
 (2, 2),
 (0, 0),
 (2, 3),
 (1, 1),
 (2, 2),
 (0, 1),
 (4, 4),
 (1, 1),
 (3, 3),
 (1, 2),
 (2, 3),
 (3, 3),
 (0, 0),
 (3, 4),
 (2, 2),
 (3, 3),
 (2, 3),
 (3, 3),
 (2, 2),
 (1, 1),
 (3, 4),
 (0, 0),
 (2, 2),
 (2, 2),
 (4, 4),
 (4, 4),
 (2, 3),
 (1, 2),
 (1, 1),
 (3, 3),
 (1, 1),
 (1, 1),
 (2, 2),
 (1, 2),
 (1, 2),
 (2, 2),
 (1, 2),
 (1, 1),
 (2, 2),
 

In [38]:
predictions_LR_test = model_LR.predict(test_X)
predictions_LR_test = [x.astype(int) for x in predictions_LR_test]
list(zip(predictions_LR_test, test_y))

[(4, 3),
 (3, 2),
 (2, 3),
 (0, 2),
 (2, 2),
 (3, 2),
 (5, 1),
 (2, 2),
 (2, 1),
 (3, 2),
 (0, 2),
 (3, 4),
 (3, 1),
 (2, 4),
 (2, 3),
 (3, 1),
 (0, 1),
 (3, 3),
 (2, 1),
 (5, 4),
 (0, 1),
 (2, 4),
 (3, 4),
 (2, 1),
 (1, 1),
 (0, 1),
 (4, 3),
 (2, 1),
 (0, 1),
 (1, 3),
 (3, 2),
 (4, 4),
 (2, 2),
 (-1, 1),
 (-2, 1),
 (2, 2),
 (2, 3),
 (1, 3),
 (1, 0),
 (0, 2),
 (-1, 2),
 (1, 0),
 (1, 4),
 (1, 2),
 (4, 4),
 (2, 2),
 (2, 2),
 (2, 3),
 (0, 4),
 (0, 1),
 (0, 2),
 (0, 3),
 (2, 4),
 (6, 4),
 (3, 3),
 (1, 2),
 (2, 3),
 (3, 4),
 (-2, 2),
 (7, 2),
 (2, 2),
 (2, 3),
 (1, 4),
 (2, 3),
 (3, 2),
 (4, 1),
 (1, 2),
 (0, 4),
 (4, 4),
 (0, 0),
 (2, 4),
 (1, 4),
 (2, 2),
 (2, 4),
 (1, 1),
 (-1, 3),
 (3, 4),
 (0, 3),
 (3, 3),
 (1, 2),
 (0, 1),
 (3, 0),
 (4, 2),
 (2, 0),
 (0, 3),
 (2, 2),
 (3, 2),
 (2, 3),
 (1, 2),
 (1, 1),
 (5, 0),
 (1, 1),
 (1, 3),
 (1, 2),
 (0, 1),
 (3, 2),
 (2, 2),
 (2, 1),
 (0, 3),
 (4, 3),
 (1, 2),
 (4, 3),
 (1, 2),
 (2, 4),
 (2, 2),
 (1, 2),
 (2, 1),
 (3, 2),
 (1, 4),
 (0, 3),
 (0, 

In [39]:
model_LR.score(test_X, test_y)

-2.379195915444804

In [None]:
model_LR.score(test_X, test_y)

Logistic Regression

In [55]:
model_LogR = LogisticRegression(max_iter = 1000, multi_class="multinomial")
model_LogR.fit(train_X, train_y)

LogisticRegression(max_iter=1000, multi_class='multinomial')

In [56]:
model_LogR.score(train_X, train_y)


0.7114577793535425

In [59]:
predictions_LogR = model_LogR.predict(train_X)

In [60]:
#predictions_LogR = model_LR.predict(test_X)
#predictions_LogR = [x.astype(int) for x in predictions_LogR]
#list(zip(predictions_LR, test_y))
predictions_LogR

array([2, 4, 2, ..., 2, 2, 1], dtype=int64)

In [33]:
accuracy_score(train_y, predictions_LogR)

0.5239023813744608

In [48]:
from sklearn.metrics import confusion_matrix

In [50]:
confusion_matrix(test_y, predictions_LogR)

ValueError: Found input variables with inconsistent numbers of samples: [7254, 16923]

KNNeighbors

In [41]:
neigh = KNeighborsClassifier(n_neighbors=20)
neigh.fit(train_X, train_y)
predictionsKNN = neigh.predict(train_X)
accuracy_score(train_y, predictionsKNN)

MemoryError: Unable to allocate 1.00 GiB for an array with shape (7931, 16923) and data type float64

In [15]:
featuresets = []

for index, row in emotions.iterrows():
    featuresets.append((build_features(row["content"])))

In [16]:
training, test = train_test_split(featuresets, test_size=0.2)
classifier = nltk.NaiveBayesClassifier.train(training)
classifier.show_most_informative_features()
print("Classifier accuracy percent:",(nltk.classify.accuracy(classifier, test))*100.00)

ValueError: too many values to unpack (expected 2)

In [None]:
emotions.to_csv("../Data/train_data.csv")

In [None]:
stopwords_list = stopwords.words("english")
word_tokens = [word_tokenize(x) for x in emotions['content']]
emotions["content2"] = [x.replace(i, '') for x in emotions['content'] for i in word_tokens if i in stopwords_list]
emotions["content2"]