In [1]:
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
from scipy.io import arff
import pandas as pd
import math
import os
import io
import regex
from tabulate import tabulate

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/CayJoBla/EmotionRecognition/master/data/original_data.csv', index_col=0)
tweet_data = df.copy()
# New classes
tweet_data.loc[tweet_data["sentiment"]=="empty", "sentiment"] = "sadness"
tweet_data.loc[tweet_data["sentiment"]=="boredom", "sentiment"] = "neutral"
tweet_data.loc[tweet_data["sentiment"]=="hate", "sentiment"] = "anger"
tweet_data.loc[tweet_data["sentiment"]=="fun", "sentiment"] = "enthusiasm"
np.sort(pd.unique(tweet_data['sentiment']))

array(['anger', 'enthusiasm', 'happiness', 'love', 'neutral', 'relief',
       'sadness', 'surprise', 'worry'], dtype=object)

In [3]:
# Define identifiers
urls = r"http:[^\s]+|w{3}\.[^\s]+|[\w]+\.[\w]{3}[/\w]*"
users = r"@(\w){1,}"
tags = r"#(\w){1,}"
symbols = r"[&[\w]+;|~$/%/^/&/*-/+/=/\/|///?:;\"<>,]|[.]+"
words = r"[\w]+"

# Parse unique identifiers
tweet_data['num_urls'] = tweet_data['content'].apply(lambda text: len(regex.findall(urls, text.lower()))).values
tweet_data['content_parsed'] = tweet_data['content'].apply(lambda text: regex.sub(urls, "", text)).values
tweet_data['num_pings'] = tweet_data['content_parsed'].apply(lambda text: len(regex.findall(users, text.lower()))).values
tweet_data['content_parsed'] = tweet_data['content_parsed'].apply(lambda text: regex.sub(users, "", text)).values
tweet_data['num_tags'] = tweet_data['content_parsed'].apply(lambda text: len(regex.findall(tags, text.lower()))).values
tweet_data['content_parsed'] = tweet_data['content_parsed'].apply(lambda text: regex.sub(tags, "", text)).values
tweet_data['#!'] = tweet_data['content_parsed'].str.count('!')
tweet_data['#?'] = tweet_data['content_parsed'].str.count('\\?')
tweet_data['#...'] = tweet_data['content_parsed'].str.count('\\.{2,}')
tweet_data['Upper/Lower ratio'] = (tweet_data['content_parsed'].str.findall(r'[A-Z]').str.len()+1) / (tweet_data['content_parsed'].str.findall(r'[a-z]').str.len()+1)
tweet_data['Number of Words'] = tweet_data['content_parsed'].apply(lambda n: len(n.split()))
tweet_data['content_parsed'] = tweet_data['content_parsed'].apply(lambda text: text.replace(';', '; '))
tweet_data['content_parsed'] = tweet_data['content_parsed'].apply(lambda text: text.replace('&', ' &'))
tweet_data['content_parsed'] = tweet_data['content_parsed'].apply(lambda text: text.replace('_', ' '))
tweet_data['content_parsed'] = tweet_data['content_parsed'].apply(lambda text: text.replace('ï', ''))
tweet_data['content_parsed'] = tweet_data['content_parsed'].apply(lambda text: text.replace('½', ''))
tweet_data['content_parsed'] = tweet_data['content_parsed'].apply(lambda text: regex.sub(symbols, " ", text)).values
tweet_data['content_parsed'] = tweet_data['content_parsed'].apply(lambda text: regex.sub(r"'", "", text)).values
tweet_data['words'] = tweet_data['content_parsed'].apply(lambda text: regex.findall(words, text.lower())).values
tweet_data["avg word length"] = tweet_data['words'].apply(lambda word_array: np.mean([len(word) for word in word_array]))
# Encode
tweet_data["sentiment"] = tweet_data["sentiment"].astype('category').cat.codes

#remove unwanted columns
# tweet_data = tweet_data.drop(['content', 'content_parsed'], axis=1)
tweet_data = tweet_data[tweet_data['avg word length'].notna()]
tweet_data

  out=out, **kwargs)


Unnamed: 0_level_0,sentiment,content,num_urls,content_parsed,num_pings,num_tags,#!,#?,#...,Upper/Lower ratio,Number of Words,words,avg word length
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1956967341,6,@tiffanylue i know i was listenin to bad habi...,0,i know i was listenin to bad habit earlier a...,1,0,0,0,0,0.016129,17,"[i, know, i, was, listenin, to, bad, habit, ea...",3.812500
1956967666,6,Layin n bed with a headache ughhhh...waitin o...,0,Layin n bed with a headache ughhhh waitin on ...,0,0,0,0,2,0.045455,10,"[layin, n, bed, with, a, headache, ughhhh, wai...",4.000000
1956967696,6,Funeral ceremony...gloomy friday...,0,Funeral ceremony gloomy friday,0,0,0,0,2,0.074074,3,"[funeral, ceremony, gloomy, friday]",6.750000
1956967789,1,wants to hang out with friends SOON!,0,wants to hang out with friends SOON!,0,0,1,0,0,0.192308,7,"[wants, to, hang, out, with, friends, soon]",4.142857
1956968416,4,@dannycastillo We want to trade with someone w...,0,We want to trade with someone who has Houston...,1,0,0,0,0,0.054545,14,"[we, want, to, trade, with, someone, who, has,...",4.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1753918900,2,Succesfully following Tayla!!,0,Succesfully following Tayla!!,0,0,2,0,0,0.125000,3,"[succesfully, following, tayla]",8.333333
1753919001,3,Happy Mothers Day All my love,0,Happy Mothers Day All my love,0,0,0,0,0,0.238095,6,"[happy, mothers, day, all, my, love]",4.000000
1753919005,3,Happy Mother's Day to all the mommies out ther...,0,Happy Mothers Day to all the mommies out there...,0,0,1,0,0,0.043956,25,"[happy, mothers, day, to, all, the, mommies, o...",3.720000
1753919043,2,@niariley WASSUP BEAUTIFUL!!! FOLLOW ME!! PEE...,1,WASSUP BEAUTIFUL!!! FOLLOW ME!! PEEP OUT MY ...,1,0,7,0,0,74.000000,18,"[wassup, beautiful, follow, me, peep, out, my,...",3.842105


In [4]:
#look for trigger words
# pd.value_counts(tweet_data['words'])
# tweet_data['happy'] = tweet_data['content'].str.count(' i ')
string_list = [" happy ", " sad ", " cry", " thanks ", " hate ", " miss ", " missing ", " hurt ", " love", " suck ", " hope ", " death ", "sorry", " pain ", " i "]

badWords = ["damn ", " omg ", "shit", "fuc"]
negativeWords = [" not ", " cant ", " cannot ", "nothin"]

newData = tweet_data

for col in string_list:
  newData[col] = newData['content_parsed'].str.count(col)

# for col in badWords:
#   tweet_data["bad words"] = tweet_data['content_parsed'].str.count(col)
newData

Unnamed: 0_level_0,sentiment,content,num_urls,content_parsed,num_pings,num_tags,#!,#?,#...,Upper/Lower ratio,...,miss,missing,hurt,love,suck,hope,death,sorry,pain,i
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1956967341,6,@tiffanylue i know i was listenin to bad habi...,0,i know i was listenin to bad habit earlier a...,1,0,0,0,0,0.016129,...,0,0,0,0,0,0,0,0,0,3
1956967666,6,Layin n bed with a headache ughhhh...waitin o...,0,Layin n bed with a headache ughhhh waitin on ...,0,0,0,0,2,0.045455,...,0,0,0,0,0,0,0,0,0,0
1956967696,6,Funeral ceremony...gloomy friday...,0,Funeral ceremony gloomy friday,0,0,0,0,2,0.074074,...,0,0,0,0,0,0,0,0,0,0
1956967789,1,wants to hang out with friends SOON!,0,wants to hang out with friends SOON!,0,0,1,0,0,0.192308,...,0,0,0,0,0,0,0,0,0,0
1956968416,4,@dannycastillo We want to trade with someone w...,0,We want to trade with someone who has Houston...,1,0,0,0,0,0.054545,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1753918900,2,Succesfully following Tayla!!,0,Succesfully following Tayla!!,0,0,2,0,0,0.125000,...,0,0,0,0,0,0,0,0,0,0
1753919001,3,Happy Mothers Day All my love,0,Happy Mothers Day All my love,0,0,0,0,0,0.238095,...,0,0,0,1,0,0,0,0,0,0
1753919005,3,Happy Mother's Day to all the mommies out ther...,0,Happy Mothers Day to all the mommies out there...,0,0,1,0,0,0.043956,...,0,0,0,0,0,0,0,0,0,0
1753919043,2,@niariley WASSUP BEAUTIFUL!!! FOLLOW ME!! PEE...,1,WASSUP BEAUTIFUL!!! FOLLOW ME!! PEEP OUT MY ...,1,0,7,0,0,74.000000,...,0,0,0,0,0,0,0,0,0,0


In [5]:
#Gets data ready for testing...
cut_data = newData.drop(['content', 'content_parsed', 'words'], axis=1)
cut_data

Unnamed: 0_level_0,sentiment,num_urls,num_pings,num_tags,#!,#?,#...,Upper/Lower ratio,Number of Words,avg word length,...,miss,missing,hurt,love,suck,hope,death,sorry,pain,i
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1956967341,6,0,1,0,0,0,0,0.016129,17,3.812500,...,0,0,0,0,0,0,0,0,0,3
1956967666,6,0,0,0,0,0,2,0.045455,10,4.000000,...,0,0,0,0,0,0,0,0,0,0
1956967696,6,0,0,0,0,0,2,0.074074,3,6.750000,...,0,0,0,0,0,0,0,0,0,0
1956967789,1,0,0,0,1,0,0,0.192308,7,4.142857,...,0,0,0,0,0,0,0,0,0,0
1956968416,4,0,1,0,0,0,0,0.054545,14,4.000000,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1753918900,2,0,0,0,2,0,0,0.125000,3,8.333333,...,0,0,0,0,0,0,0,0,0,0
1753919001,3,0,0,0,0,0,0,0.238095,6,4.000000,...,0,0,0,1,0,0,0,0,0,0
1753919005,3,0,0,0,1,0,0,0.043956,25,3.720000,...,0,0,0,0,0,0,0,0,0,0
1753919043,2,1,1,0,7,0,0,74.000000,18,3.842105,...,0,0,0,0,0,0,0,0,0,0


In [8]:
tweet_np = np.array(cut_data)

grab70 = round(len(tweet_np) * .7)
emotionColumns = ['num_urls', 'num_pings', 'num_tags', '#!', '#?', '#...', 'Upper/Lower ratio', 'Number of Words', 
                  'avg word length', 'happy', 'sad', 'cry', 'thanks', 'hate', 'miss', 'missing', 'hurt', 'love', 
                  'suck', 'hope', 'death', 'sorry', 'pain', "i", "bad words"]
headers = ["Iterations", "Accuracy"]
table = []
aveAccuracy = 0

for i in range(3):
  print("k=", i)
  np.random.shuffle(tweet_np)

  trainX = tweet_np[:grab70, 1:]
  trainY = tweet_np[:grab70, 0]

  testX = tweet_np[grab70:, 1:]
  testY = tweet_np[grab70:, 0]

  clf = RandomForestClassifier(max_depth = 10, random_state=0, n_estimators = 100)
  clf.fit(trainX, trainY)

  Accuracy = clf.score(testX, testY)
  aveAccuracy += Accuracy
  column = i, Accuracy
  table.append(column)

aveAccuracy = aveAccuracy / 3
column = "Average", aveAccuracy
table.append(column)

# print("\nAccuracy for housing data is", houseAccuracy)
from IPython.display import HTML, display
display(HTML(tabulate(table, headers=headers, tablefmt='html')))

# print("Accuracy is", Accuracy)

k= 0
k= 1
k= 2


Iterations,Accuracy
0,0.294486
1,0.297911
2,0.295823
Average,0.296074


In [7]:
from sklearn.datasets import load_iris
from sklearn import tree

# plt.figure(figsize=(10,10))
# tree.plot_tree(clf, filled = True)
# plt.show()

text_representation = tree.export_text(clf, feature_names = emotionColumns)
print(text_representation)

AttributeError: ignored

In [None]:

# from sklearn.datasets import load_iris
# iris = load_iris()

# # Model (can also use single decision tree)
# from sklearn.ensemble import RandomForestClassifier
# model = RandomForestClassifier(n_estimators=10)

# # Train
# model.fit(trainX, trainY)
# # Extract single tree
# estimator = model.estimators_[5]

# featureNames = ['num_urls', 'num_pings', 'num_tags', '#!', '#?', '#...', 'Upper/Lower ratio', 'Number of Words', 'avg word length']

# from sklearn.tree import export_graphviz
# # Export as dot file
# export_graphviz(estimator, out_file='tree.dot', 
#                 feature_names = featureNames,
#                 # class_names = iris.target_names,
#                 rounded = True, proportion = False, 
#                 precision = 2, filled = True)

# # Convert to png using system command (requires Graphviz)
# from subprocess import call
# call(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600'])

# # Display in jupyter notebook
# from IPython.display import Image
# Image(filename = 'tree.png')