In [1]:
import numpy as np
import pandas as pd
import re
from sklearn.utils import resample
from nltk.tokenize import word_tokenize, TreebankWordTokenizer

from nltk import SnowballStemmer, PorterStemmer, LancasterStemmer

from sklearn.model_selection import train_test_split

from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [2]:
#read csv files
df_tweets = pd.read_csv('train.csv',index_col='tweetid')
df_kaggle_test = pd.read_csv('test.csv',index_col='tweetid')

In [3]:
#determine class balance
sentiment_count = df_tweets[['sentiment', 'message']].groupby('sentiment').count()
sentiment = pd.DataFrame({'sentiment_description':['anti','neutral','pro','news']},index=[-1,0,1,2])
sentiment.join(sentiment_count)

Unnamed: 0,sentiment_description,message
-1,anti,1296
0,neutral,2353
1,pro,8530
2,news,3640


In [4]:
news_2 = df_tweets[df_tweets['sentiment']==2]
pro_1 = df_tweets[df_tweets['sentiment']==1]
neutral_0 = df_tweets[df_tweets['sentiment']==0]
anti_n1 = df_tweets[df_tweets['sentiment']==-1]

In [5]:
# Downsample majority

class_size = int(len(pro_1)/2)

pro_1_resampled = resample(pro_1,
                         replace=False, # sample without replacement (no need to duplicate observations)
                         n_samples=class_size, # match number in minority class
                          random_state=27) # reproducible result

news_2_resampled = resample(news_2,
                          replace=True, # sample with replacement (we need to duplicate observations)
                          n_samples=class_size, # match number in minority class
                          random_state=27) # reproducible results

neutral_0_resampled = resample(neutral_0,
                          replace=True, # sample with replacement (we need to duplicate observations)
                          n_samples=class_size, # match number in minority class
                          random_state=27) # reproducible results

anti_n1_resampled = resample(anti_n1,
                          replace=True, # sample with replacement (we need to duplicate observations)
                          n_samples=class_size, # match number in minority class
                          random_state=27) # reproducible results 

In [6]:
df_train = pd.concat([pro_1_resampled, news_2_resampled, neutral_0_resampled, anti_n1_resampled])
len(df_train)
#df_tweets = df

17060

In [7]:
#lower case formatting
df_train['message'] = df_train['message'].str.lower()
df_kaggle_test['message'] = df_kaggle_test['message'].str.lower()
#df_kaggle.head(20)

In [8]:
pattern_url = r'http[s]?://(?:[A-Za-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9A-Fa-f][0-9A-Fa-f]))+'
subs_url = r'url-web'
df_train['message'] = df_train['message'].replace(to_replace = pattern_url, value = subs_url, regex = True)
df_kaggle_test['message'] = df_kaggle_test['message'].replace(to_replace = pattern_url, value = subs_url, regex = True)

In [19]:
df_train.iloc[0]


sentiment                                                    1
message      rt @ubcforestry: funding from @genomebc will s...
Name: 977844, dtype: object

In [20]:
import string
def remove_punctuation_numbers(post):
    punc_numbers = string.punctuation + '0123456789'
    return ''.join([l for l in post if l not in punc_numbers])

df_train['message'] = df_train['message'].apply(remove_punctuation_numbers)
df_kaggle_test['message'] = df_kaggle_test['message'].apply(remove_punctuation_numbers)


In [21]:
#Tokenize word
tokeniser = TreebankWordTokenizer()
df_train['tokens'] = df_train['message'].apply(tokeniser.tokenize)
df_kaggle_test['tokens'] = df_kaggle_test['message'].apply(tokeniser.tokenize)

In [22]:
def remove_stop_words(tokens):    
    return [t for t in tokens if t not in stopwords.words('english')]

In [23]:
df_train['tokens'] = df_train['tokens'].apply(remove_stop_words)
df_kaggle_test['tokens'] = df_kaggle_test['tokens'].apply(remove_stop_words)

In [24]:
def scrub_words(text):
    """Basic cleaning of texts."""
    clean_text = []
    for word in text:

    # remove html markup
        word=re.sub("(<.*?>)","",word)
    
    #remove non-ascii and digits
        word=re.sub("(\\W|\\d)"," ",word)
        word=re.sub("\\s+(in|the|all|for|and|on)\\s+"," _connector_ ",word) # normalize certain words
    #remove whitespace
        word=word.strip()
        clean_text.append(word)
    return clean_text

In [25]:
df_train['clean_tokens'] = df_train['tokens'].apply(scrub_words)
df_kaggle_test['clean_tokens'] = df_kaggle_test['tokens'].apply(scrub_words)

In [26]:
lemmatizer = WordNetLemmatizer()

In [27]:
def mbti_lemma(words, lemmatizer):
    return [lemmatizer.lemmatize(word) for word in words]    

In [28]:
df_train['lemma_tokens'] = df_train['clean_tokens'].apply(mbti_lemma, args=(lemmatizer, ))
df_kaggle_test['lemma_tokens'] = df_kaggle_test['clean_tokens'].apply(mbti_lemma, args=(lemmatizer, ))

In [29]:
df_train['clean_message'] = [' '.join(map(str, l)) for l in df_train['lemma_tokens']]
df_kaggle_test['clean_message'] = [' '.join(map(str, l)) for l in df_kaggle_test['lemma_tokens']]

In [30]:
df_train

Unnamed: 0_level_0,sentiment,message,tokens,clean_tokens,lemma_tokens,clean_message
tweetid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
977844,1,rt ubcforestry funding from genomebc will supp...,"[rt, ubcforestry, funding, genomebc, support, ...","[rt, ubcforestry, funding, genomebc, support, ...","[rt, ubcforestry, funding, genomebc, support, ...",rt ubcforestry funding genomebc support sallyn...
441956,1,yadimoiina gag orders sure hes definitely gree...,"[yadimoiina, gag, orders, sure, hes, definitel...","[yadimoiina, gag, orders, sure, hes, definitel...","[yadimoiina, gag, order, sure, he, definitely,...",yadimoiina gag order sure he definitely green ...
978938,1,rt pattonoswalt not ominous at all he also wan...,"[rt, pattonoswalt, ominous, also, wants, names...","[rt, pattonoswalt, ominous, also, wants, names...","[rt, pattonoswalt, ominous, also, want, name, ...",rt pattonoswalt ominous also want name anyone ...
587737,1,rt melissajpeltier in case you forgot about th...,"[rt, melissajpeltier, case, forgot, chinese, h...","[rt, melissajpeltier, case, forgot, chinese, h...","[rt, melissajpeltier, case, forgot, chinese, h...",rt melissajpeltier case forgot chinese hoax gl...
804767,1,rt sethmacfarlane hrc proposes installing half...,"[rt, sethmacfarlane, hrc, proposes, installing...","[rt, sethmacfarlane, hrc, proposes, installing...","[rt, sethmacfarlane, hrc, proposes, installing...",rt sethmacfarlane hrc proposes installing half...
...,...,...,...,...,...,...
517059,-1,the priority for most africans is getting food...,"[priority, africans, getting, food, empty, tum...","[priority, africans, getting, food, empty, tum...","[priority, african, getting, food, empty, tumm...",priority african getting food empty tummy ever...
759713,-1,rt realdonaldtrump the concept of global warmi...,"[rt, realdonaldtrump, concept, global, warming...","[rt, realdonaldtrump, concept, global, warming...","[rt, realdonaldtrump, concept, global, warming...",rt realdonaldtrump concept global warming crea...
189585,-1,rt cattharmony id rather marchforbabies than m...,"[rt, cattharmony, id, rather, marchforbabies, ...","[rt, cattharmony, id, rather, marchforbabies, ...","[rt, cattharmony, id, rather, marchforbabies, ...",rt cattharmony id rather marchforbabies march ...
763763,-1,rt loftyjester of course they have fuck all to...,"[rt, loftyjester, course, fuck, fake, climate,...","[rt, loftyjester, course, fuck, fake, climate,...","[rt, loftyjester, course, fuck, fake, climate,...",rt loftyjester course fuck fake climate change...


In [31]:
betterVect = TfidfVectorizer(ngram_range=(1,2), min_df=2,max_df=0.5, stop_words="english")

In [32]:
X_vect = betterVect.fit_transform(df_train['clean_message'])
X_kaggle = betterVect.transform(df_kaggle_test['clean_message'])

In [33]:
X = X_vect.toarray()
y = df_train['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_train

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)

In [35]:
f1_score(y_test, y_pred, average="macro")

0.840435277484903

In [36]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          -1       0.96      0.94      0.95      1251
           0       0.78      0.89      0.83      1275
           1       0.81      0.67      0.73      1292
           2       0.83      0.88      0.86      1300

    accuracy                           0.84      5118
   macro avg       0.84      0.84      0.84      5118
weighted avg       0.84      0.84      0.84      5118



In [37]:
result_frame = {'tweetid' : df_kaggle_test.index,'sentiment' : rfc.predict(X_kaggle).astype(int)}
df_result = pd.DataFrame(result_frame)
df_result.set_index('tweetid',inplace=True)
df_result["sentiment"].unique()

array([ 2,  1,  0, -1], dtype=int64)

In [38]:
df_result

Unnamed: 0_level_0,sentiment
tweetid,Unnamed: 1_level_1
169760,2
35326,1
224985,1
476263,1
872928,0
...,...
895714,1
875167,1
78329,1
867455,0


In [39]:
df_result.to_csv('KaggleSubmission_20201018_07.csv')