In [75]:
# Import packages
import csv
import json
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import nltk

nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to C:\Users\Fan Kai
[nltk_data]     Jie\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Fan Kai
[nltk_data]     Jie\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [76]:
# Load tweets data
data = []
with open('tweets_DM.json', 'r') as f:
    for i in f:
        data.append(json.loads(i))

# Flatten json into dataframe
df = pd.json_normalize(data)

In [77]:
# Rename _source.tweet.tweet_id to tweet_id for merging with emotion and identification data later
df.rename(columns={'_source.tweet.tweet_id' : 'tweet_id'}, inplace=True)

In [78]:
# Load emotion and identification data
emotion = pd.read_csv('emotion.csv')
identification = pd.read_csv('data_identification.csv')

In [79]:
# Visualise emotion data
emotion

Unnamed: 0,tweet_id,emotion
0,0x3140b1,sadness
1,0x368b73,disgust
2,0x296183,anticipation
3,0x2bd6e1,joy
4,0x2ee1dd,anticipation
...,...,...
1455558,0x38dba0,joy
1455559,0x300ea2,joy
1455560,0x360b99,fear
1455561,0x22eecf,joy


In [80]:
# Visualise identification data
identification

Unnamed: 0,tweet_id,identification
0,0x28cc61,test
1,0x29e452,train
2,0x2b3819,train
3,0x2db41f,test
4,0x2a2acc,train
...,...,...
1867530,0x227e25,train
1867531,0x293813,train
1867532,0x1e1a7e,train
1867533,0x2156a5,train


In [81]:
# Visualise dataframe
df

Unnamed: 0,_score,_index,_crawldate,_type,_source.tweet.hashtags,tweet_id,_source.tweet.text
0,391,hashtag_tweets,2015-05-23 11:42:47,tweets,[Snapchat],0x376b20,"People who post ""add me on #Snapchat"" must be ..."
1,433,hashtag_tweets,2016-01-28 04:52:09,tweets,"[freepress, TrumpLegacy, CNN]",0x2d5350,"@brianklaas As we see, Trump is dangerous to #..."
2,232,hashtag_tweets,2017-12-25 04:39:20,tweets,[bibleverse],0x28b412,"Confident of your obedience, I write to you, k..."
3,376,hashtag_tweets,2016-01-24 23:53:05,tweets,[],0x1cd5b0,Now ISSA is stalking Tasha 😂😂😂 <LH>
4,989,hashtag_tweets,2016-01-08 17:18:59,tweets,[],0x2de201,"""Trust is not the same as faith. A friend is s..."
...,...,...,...,...,...,...,...
1867530,827,hashtag_tweets,2015-05-12 12:51:52,tweets,"[mixedfeeling, butimTHATperson]",0x316b80,When you buy the last 2 tickets remaining for ...
1867531,368,hashtag_tweets,2017-10-02 17:54:04,tweets,[],0x29d0cb,I swear all this hard work gone pay off one da...
1867532,498,hashtag_tweets,2016-10-10 11:04:32,tweets,[],0x2a6a4f,@Parcel2Go no card left when I wasn't in so I ...
1867533,840,hashtag_tweets,2016-09-02 14:25:06,tweets,[],0x24faed,"Ah, corporate life, where you can date <LH> us..."


In [82]:
# Merge df and identification data first as they have the same number of rows
overall = pd.merge(df, identification, on='tweet_id')

In [83]:
# Visualise dataframe after 1st merging
overall

Unnamed: 0,_score,_index,_crawldate,_type,_source.tweet.hashtags,tweet_id,_source.tweet.text,identification
0,391,hashtag_tweets,2015-05-23 11:42:47,tweets,[Snapchat],0x376b20,"People who post ""add me on #Snapchat"" must be ...",train
1,433,hashtag_tweets,2016-01-28 04:52:09,tweets,"[freepress, TrumpLegacy, CNN]",0x2d5350,"@brianklaas As we see, Trump is dangerous to #...",train
2,232,hashtag_tweets,2017-12-25 04:39:20,tweets,[bibleverse],0x28b412,"Confident of your obedience, I write to you, k...",test
3,376,hashtag_tweets,2016-01-24 23:53:05,tweets,[],0x1cd5b0,Now ISSA is stalking Tasha 😂😂😂 <LH>,train
4,989,hashtag_tweets,2016-01-08 17:18:59,tweets,[],0x2de201,"""Trust is not the same as faith. A friend is s...",test
...,...,...,...,...,...,...,...,...
1867530,827,hashtag_tweets,2015-05-12 12:51:52,tweets,"[mixedfeeling, butimTHATperson]",0x316b80,When you buy the last 2 tickets remaining for ...,test
1867531,368,hashtag_tweets,2017-10-02 17:54:04,tweets,[],0x29d0cb,I swear all this hard work gone pay off one da...,test
1867532,498,hashtag_tweets,2016-10-10 11:04:32,tweets,[],0x2a6a4f,@Parcel2Go no card left when I wasn't in so I ...,test
1867533,840,hashtag_tweets,2016-09-02 14:25:06,tweets,[],0x24faed,"Ah, corporate life, where you can date <LH> us...",train


In [85]:
# Split overall dataframe into train and test dataframes
train = overall[overall['identification'] == 'train']
test = overall[overall['identification'] == 'test']

In [86]:
# Visualise train dataframe
train

Unnamed: 0,_score,_index,_crawldate,_type,_source.tweet.hashtags,tweet_id,_source.tweet.text,identification
0,391,hashtag_tweets,2015-05-23 11:42:47,tweets,[Snapchat],0x376b20,"People who post ""add me on #Snapchat"" must be ...",train
1,433,hashtag_tweets,2016-01-28 04:52:09,tweets,"[freepress, TrumpLegacy, CNN]",0x2d5350,"@brianklaas As we see, Trump is dangerous to #...",train
3,376,hashtag_tweets,2016-01-24 23:53:05,tweets,[],0x1cd5b0,Now ISSA is stalking Tasha 😂😂😂 <LH>,train
5,120,hashtag_tweets,2015-06-11 04:44:05,tweets,"[authentic, LaughOutLoud]",0x1d755c,@RISKshow @TheKevinAllison Thx for the BEST TI...,train
6,1021,hashtag_tweets,2015-08-18 02:30:07,tweets,[],0x2c91a8,Still waiting on those supplies Liscus. <LH>,train
...,...,...,...,...,...,...,...,...
1867526,94,hashtag_tweets,2016-12-26 02:44:07,tweets,"[NoWonder, Happy]",0x321566,I'm SO HAPPY!!! #NoWonder the name of this sho...,train
1867527,627,hashtag_tweets,2015-04-01 08:14:56,tweets,[],0x38959e,In every circumtance I'd like to be thankful t...,train
1867528,274,hashtag_tweets,2016-11-17 23:46:22,tweets,[blessyou],0x2cbca6,there's currently two girls walking around the...,train
1867533,840,hashtag_tweets,2016-09-02 14:25:06,tweets,[],0x24faed,"Ah, corporate life, where you can date <LH> us...",train


In [87]:
# Merge train dataframe with emotion dataframe only as emotion dataframe only consists of training labels
train = pd.merge(train, emotion, on='tweet_id')

In [88]:
# Split the labels column out after merging, so that the index is aligned with the train dataframe
y_train = train.pop('emotion').to_frame()

In [90]:
# Visualise y_train dataframe
y_train

Unnamed: 0,emotion
0,anticipation
1,sadness
2,fear
3,joy
4,anticipation
...,...
1455558,joy
1455559,joy
1455560,joy
1455561,joy


In [89]:
# Visualise train dataframe to confirm that it is only left with the features
train

Unnamed: 0,_score,_index,_crawldate,_type,_source.tweet.hashtags,tweet_id,_source.tweet.text,identification
0,391,hashtag_tweets,2015-05-23 11:42:47,tweets,[Snapchat],0x376b20,"People who post ""add me on #Snapchat"" must be ...",train
1,433,hashtag_tweets,2016-01-28 04:52:09,tweets,"[freepress, TrumpLegacy, CNN]",0x2d5350,"@brianklaas As we see, Trump is dangerous to #...",train
2,376,hashtag_tweets,2016-01-24 23:53:05,tweets,[],0x1cd5b0,Now ISSA is stalking Tasha 😂😂😂 <LH>,train
3,120,hashtag_tweets,2015-06-11 04:44:05,tweets,"[authentic, LaughOutLoud]",0x1d755c,@RISKshow @TheKevinAllison Thx for the BEST TI...,train
4,1021,hashtag_tweets,2015-08-18 02:30:07,tweets,[],0x2c91a8,Still waiting on those supplies Liscus. <LH>,train
...,...,...,...,...,...,...,...,...
1455558,94,hashtag_tweets,2016-12-26 02:44:07,tweets,"[NoWonder, Happy]",0x321566,I'm SO HAPPY!!! #NoWonder the name of this sho...,train
1455559,627,hashtag_tweets,2015-04-01 08:14:56,tweets,[],0x38959e,In every circumtance I'd like to be thankful t...,train
1455560,274,hashtag_tweets,2016-11-17 23:46:22,tweets,[blessyou],0x2cbca6,there's currently two girls walking around the...,train
1455561,840,hashtag_tweets,2016-09-02 14:25:06,tweets,[],0x24faed,"Ah, corporate life, where you can date <LH> us...",train


In [91]:
# Visualise test dataframe after merging
test

Unnamed: 0,_score,_index,_crawldate,_type,_source.tweet.hashtags,tweet_id,_source.tweet.text,identification
2,232,hashtag_tweets,2017-12-25 04:39:20,tweets,[bibleverse],0x28b412,"Confident of your obedience, I write to you, k...",test
4,989,hashtag_tweets,2016-01-08 17:18:59,tweets,[],0x2de201,"""Trust is not the same as faith. A friend is s...",test
9,66,hashtag_tweets,2015-09-09 09:22:55,tweets,"[materialism, money, possessions]",0x218443,When do you have enough ? When are you satisfi...,test
30,104,hashtag_tweets,2015-10-10 14:33:26,tweets,"[GodsPlan, GodsWork]",0x2939d5,"God woke you up, now chase the day #GodsPlan #...",test
33,310,hashtag_tweets,2016-10-23 08:49:50,tweets,[],0x26289a,"In these tough times, who do YOU turn to as yo...",test
...,...,...,...,...,...,...,...,...
1867525,602,hashtag_tweets,2016-12-10 18:01:00,tweets,[],0x2913b4,"""For this is the message that ye heard from th...",test
1867529,598,hashtag_tweets,2015-01-04 14:40:55,tweets,[],0x2a980e,"""There is a lad here, which hath five barley l...",test
1867530,827,hashtag_tweets,2015-05-12 12:51:52,tweets,"[mixedfeeling, butimTHATperson]",0x316b80,When you buy the last 2 tickets remaining for ...,test
1867531,368,hashtag_tweets,2017-10-02 17:54:04,tweets,[],0x29d0cb,I swear all this hard work gone pay off one da...,test


In [92]:
# Check if there are any missing values in train dataframe
train.isna().sum()

# Conclusion: There are no missing values in train dataframe

_score                    0
_index                    0
_crawldate                0
_type                     0
_source.tweet.hashtags    0
tweet_id                  0
_source.tweet.text        0
identification            0
dtype: int64

In [93]:
# Check if there are any missing values in test dataframe
test.isna().sum()

# Conclusion: There are no missing values in test dataframe

_score                    0
_index                    0
_crawldate                0
_type                     0
_source.tweet.hashtags    0
tweet_id                  0
_source.tweet.text        0
identification            0
dtype: int64

In [95]:
# To pre-process tweets by removing irrelavant characters and standardise all to lowercase
def preprocess_tweet(tweet):
    tweet = re.sub(r"http\S+|www\S+|https\S+", '', tweet, flags=re.MULTILINE) # To remove URLs
    tweet = re.sub(r'\@w+', '', tweet) # To remove mentions
    tweet = re.sub(r'#', '', tweet) # To remove hashtags
    tweet = re.sub(r'\d+', '', tweet) # To remove numbers
    tweet = tweet.lower() # Convert to lowercase
    tweet = re.sub(r'\s+', ' ', tweet).strip() # To remove extra whitespace
    return tweet

In [96]:
# Pre-process tweets in train dataframe
train_processed = [preprocess_tweet(r['_source.tweet.text']) for i, r in train.iterrows()]

In [97]:
# Visualise train_processed dataframe
train_processed

['people who post "add me on snapchat" must be dehydrated. cuz man.... that\'s <lh>',
 '@brianklaas as we see, trump is dangerous to freepress around the world. what a <lh> <lh> trumplegacy. cnn',
 'now issa is stalking tasha 😂😂😂 <lh>',
 '@riskshow @thekevinallison thx for the best time tonight. what stories! heartbreakingly <lh> authentic laughoutloud good!!',
 'still waiting on those supplies liscus. <lh>',
 'love knows no gender. 😢😭 <lh>',
 '@dstvngcare @dstvng more highlights are being shown than actual sports! who watches triathlon highlights anyway? <lh> leaguecup',
 'the ssm debate; <lh> (a manufactured fantasy used to distract the ignorant masses from their mundane lives) v gender diversity (a m......',
 "i love suffering 🙃🙃 i love when valium does nothing to help 🙃🙃 i love when my doctors say that they've done all they can 🙃🙃 <lh>",
 'can someone tell my why my feeds scroll back to the same tweets that i saw min ago? pissed!',
 'you know you research butterflies when predictiv

In [98]:
# As what I did for train dataframe, I pre-process tweets in test dataframe as well
test_processed = [preprocess_tweet(r['_source.tweet.text']) for i, r in test.iterrows()]

In [99]:
# Visualise test_processed dataframe
test_processed

['confident of your obedience, i write to you, knowing that you will do even more than i ask. (philemon :) / bibleverse <lh> <lh>',
 '"trust is not the same as faith. a friend is someone you trust. putting faith in anyone is a mistake." ~ christopher hitchens <lh> <lh>',
 'when do you have enough ? when are you satisfied ? is you goal really all about money ? materialism money possessions <lh>',
 'god woke you up, now chase the day godsplan godswork <lh>',
 'in these tough times, who do you turn to as your symbol of hope? <lh>',
 'turns out you can recognise people by their undies. <lh>',
 'i like how hayvens mommy, daddy, and the keyboard warriors have to jump into everything. she can’t handle anything herself. sheltered <lh>',
 'i just love it when every single one of my songs just delete themselves..😡😒 this is the rd times this has happened! <lh> notamused',
 '@juliechen when can we expect a season of celebritybigbrother i think that would be <lh>',
 'tbh. regret hurts more than ste

In [100]:
# Initialise the vectorizer
vectorizer = TfidfVectorizer(max_features=1000, stop_words=stopwords.words('english'))

In [101]:
# Fit and transform train_processed dataframe
X = vectorizer.fit_transform(train_processed)

In [102]:
# Split the train_processed dataframe into train and validation dataframes
X_train, X_val, y_train, y_val = train_test_split(X, y_train, test_size=0.2, random_state=42)

In [103]:
# Initialise the RandomForest model 
model = RandomForestClassifier(n_estimators=100, random_state=42)

In [104]:
# Train the model 
model.fit(X_train, y_train)

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  model.fit(X_train, y_train)


In [105]:
# Make predictions with validation data first
predictions = model.predict(X_val)

In [106]:
# Evaluate the model
print(classification_report(y_val, predictions))

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


              precision    recall  f1-score   support

       anger       0.43      0.10      0.16      7946
anticipation       0.59      0.48      0.53     49984
     disgust       0.38      0.26      0.31     27669
        fear       0.64      0.28      0.39     12846
         joy       0.49      0.81      0.61    102943
     sadness       0.47      0.38      0.42     38745
    surprise       0.58      0.17      0.26      9816
       trust       0.54      0.26      0.35     41164

    accuracy                           0.50    291113
   macro avg       0.52      0.34      0.38    291113
weighted avg       0.51      0.50      0.47    291113



In [107]:
# As what I did to train_processed dataframe, I fit and transform test_processed dataframe as well
X_test = vectorizer.fit_transform(test_processed)

In [108]:
# Make predictions with test data
actual_pred = model.predict(X_test)

In [109]:
# Visualise actual_pred array
actual_pred

array(['sadness', 'trust', 'sadness', ..., 'disgust', 'joy', 'sadness'],
      dtype=object)

In [113]:
# Check if it is the expected output, as after writing predcitions into csv file, the word is split up into columns. 
# For example, instead of 'sadness', it wrote 's', 'a', 'd', 'n', 'e', 's', 's' into the csv file.

actual_pred[0]

'sadness'

In [118]:
# Separate out the tweet_id
test_id = test['tweet_id']

In [122]:
# Check if test_id is in the format I want
list(test_id)

['0x28b412',
 '0x2de201',
 '0x218443',
 '0x2939d5',
 '0x26289a',
 '0x31c6e0',
 '0x32edee',
 '0x3714ee',
 '0x235628',
 '0x283024',
 '0x25dcd8',
 '0x33df6e',
 '0x2d6cb6',
 '0x21d36a',
 '0x34fc3e',
 '0x338b96',
 '0x38895c',
 '0x260d2b',
 '0x38c20a',
 '0x21ee14',
 '0x245e5b',
 '0x2fa1a8',
 '0x340bc5',
 '0x21114e',
 '0x32d429',
 '0x380533',
 '0x2e59dd',
 '0x32de62',
 '0x2f2f42',
 '0x2b3f7a',
 '0x1f0f0d',
 '0x1e0006',
 '0x24c180',
 '0x2c0cbb',
 '0x2b5304',
 '0x2213f8',
 '0x25ae82',
 '0x1dcc6b',
 '0x317bd3',
 '0x366e92',
 '0x3186fc',
 '0x2e190e',
 '0x2158b8',
 '0x3204bd',
 '0x2ec3b8',
 '0x203e91',
 '0x2d72d4',
 '0x20e76d',
 '0x2255e2',
 '0x1d6edf',
 '0x2f07b4',
 '0x20f8f2',
 '0x36627d',
 '0x1d863a',
 '0x1dc4b5',
 '0x318056',
 '0x2d4d7f',
 '0x24afcd',
 '0x2c9398',
 '0x22d1ce',
 '0x2940be',
 '0x32b37a',
 '0x1e52b7',
 '0x1cf9f5',
 '0x2b0197',
 '0x23c767',
 '0x27c6e5',
 '0x1ed702',
 '0x2b4cf6',
 '0x22e505',
 '0x20410d',
 '0x1fbc44',
 '0x27485a',
 '0x33bd17',
 '0x295aed',
 '0x2dea64',
 '0x3160c7',

In [116]:
# Write the predictions into the csv file first
csv_file_name = 'output.csv'

with open(csv_file_name, 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    
    for row in actual_pred:
        if not isinstance(row, list):
            row = [row]  # Convert a single value into a list so that the spliting up of words do not occur
        csvwriter.writerow(row)

In [123]:
# Set headers
new_column_data = list(test_id)
new_column_header = 'id'
existing_column_header = 'emotion'

# Read the existing data from the above CSV file
existing_data = []
with open('output.csv', 'r', newline='') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        existing_data.append(row)

# Combine the new column data with the existing data
combined_data = [[new_column_header, existing_column_header]]
for i, row in enumerate(existing_data):
    combined_data.append([new_column_data[i], row[0]])

# Write the combined data to a new CSV file
with open('updated_emotions.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(combined_data)
