In [1]:
import numpy as np
import pandas as pd
import re
from sklearn.utils import resample

from nltk import SnowballStemmer, PorterStemmer, LancasterStemmer
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split

from sklearn import metrics
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

from sklearn.svm import SVC

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.corpus import stopwords

from sklearn.svm import SVC


In [2]:
# Load the training tweets and the Kaggle test tweets; both files are read
# with the 'tweetid' column as the index.
df_tweets, df_kaggle_test = (
    pd.read_csv(csv_name, index_col='tweetid')
    for csv_name in ('train.csv', 'test.csv')
)

In [3]:
# Class balance: number of messages per sentiment label, displayed next to a
# short description of each label.
sentiment = pd.DataFrame(
    {'sentiment_description': ['anti', 'neutral', 'pro', 'news']},
    index=[-1, 0, 1, 2],
)
sentiment_count = (
    df_tweets[['sentiment', 'message']]
    .groupby('sentiment')
    .count()
)
sentiment.join(sentiment_count)

Unnamed: 0,sentiment_description,message
-1,anti,1296
0,neutral,2353
1,pro,8530
2,news,3640


In [4]:
# One frame per sentiment class, selected with a shared label column.
sentiment_labels = df_tweets['sentiment']
anti_n1 = df_tweets[sentiment_labels == -1]
neutral_0 = df_tweets[sentiment_labels == 0]
pro_1 = df_tweets[sentiment_labels == 1]
news_2 = df_tweets[sentiment_labels == 2]

In [5]:
# Rebalance the classes to one common size: the 'pro' class is downsampled
# and the other three classes are upsampled.
# NOTE(review): this runs before the train/test split in a later cell, so
# rows duplicated by upsampling can end up on both sides of that split.

class_size = len(pro_1) // 2  # target rows per class: half of the 'pro' class

# Downsample without replacement so that no 'pro' tweet is duplicated.
pro_1_resampled = resample(pro_1, replace=False, n_samples=class_size, random_state=27)

# Upsample with replacement (rows repeat whenever a class has fewer than
# class_size tweets); the fixed seed keeps the draw reproducible.
upsample_options = dict(replace=True, n_samples=class_size, random_state=27)
news_2_resampled = resample(news_2, **upsample_options)
neutral_0_resampled = resample(neutral_0, **upsample_options)
anti_n1_resampled = resample(anti_n1, **upsample_options)

In [6]:
# Stack the four resampled class frames into the training frame and show its
# row count.
resampled_classes = [
    pro_1_resampled,
    news_2_resampled,
    neutral_0_resampled,
    anti_n1_resampled,
]
df_train = pd.concat(resampled_classes)
len(df_train)

17060

In [7]:
# Lower-case every message in both the training and the Kaggle test frame.
for tweets in (df_train, df_kaggle_test):
    tweets['message'] = tweets['message'].str.lower()

In [8]:
# Replace every http/https link with a single placeholder token, in both the
# training and the Kaggle test messages.
pattern_url = r'http[s]?://(?:[A-Za-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9A-Fa-f][0-9A-Fa-f]))+'
subs_url = r'url-web'
for tweet_frame in (df_train, df_kaggle_test):
    tweet_frame['message'] = tweet_frame['message'].replace(
        to_replace=pattern_url,
        value=subs_url,
        regex=True,
    )

In [9]:
#remove punctuation & digits
import string
def remove_punctuation_numbers(post):
    """Return ``post`` without ASCII punctuation and digit characters.

    Matching characters are deleted rather than replaced, so the text on
    either side of a removed character is joined together.
    """
    unwanted = string.punctuation + '0123456789'
    # A translation table that maps each unwanted character to "delete".
    return post.translate(str.maketrans('', '', unwanted))

# Strip punctuation and digits from the messages of both frames.
for tweet_frame in (df_train, df_kaggle_test):
    tweet_frame['message'] = tweet_frame['message'].apply(remove_punctuation_numbers)


In [10]:
# Porter stemmer instance used by message_preprocessing to stem each word.
porter_stemmer=PorterStemmer()
def message_preprocessing(text):
    """Clean a message and stem its words; used as the vectorizer preprocessor.

    Steps, in order:
      1. remove anything that looks like a markup tag (``<...>``);
      2. replace every remaining non-word character with a space;
      3. rewrite a few whitespace-delimited connector words as ``_connector_``;
      4. split on whitespace and stem each piece with ``porter_stemmer``.

    Tag removal must run before step 2: once the angle brackets have been
    replaced by spaces the tag pattern can no longer match, which previously
    left tag names (e.g. ``b`` from ``<b>``) in the text.

    Parameters
    ----------
    text : str
        The message to clean.

    Returns
    -------
    str
        The stemmed pieces joined by single spaces.
    """
    text = re.sub("(<.*?>)", "", text)  # strip tags while '<' and '>' still exist
    text = re.sub("\\W", " ", text)  # remove special chars
    # Each match consumes the whitespace on both sides and matches do not
    # overlap, so in a run such as "in the" only the first word is rewritten.
    text = re.sub("\\s+(in|the|all|for|and|on)\\s+", " _connector_ ", text)  # normalize certain words

    # stem words
    words = re.split("\\s+", text)
    stemmed_words = [porter_stemmer.stem(word=word) for word in words]
    return ' '.join(stemmed_words)

In [11]:
# Display the training frame after lower-casing, URL replacement and
# punctuation/digit removal.
df_train

Unnamed: 0_level_0,sentiment,message
tweetid,Unnamed: 1_level_1,Unnamed: 2_level_1
977844,1,rt ubcforestry funding from genomebc will supp...
441956,1,yadimoiina gag orders sure hes definitely gree...
978938,1,rt pattonoswalt not ominous at all he also wan...
587737,1,rt melissajpeltier in case you forgot about th...
804767,1,rt sethmacfarlane hrc proposes installing half...
...,...,...
517059,-1,the priority for most africans is getting food...
759713,-1,rt realdonaldtrump the concept of global warmi...
189585,-1,rt cattharmony id rather marchforbabies than m...
763763,-1,rt loftyjester of course they have fuck all to...


In [12]:
# TF-IDF vectorizer whose text cleaning is delegated to message_preprocessing.
# NOTE(review): the preprocessor stems words before the built-in English stop
# list is applied, so stemmed forms may not match the stop list; this is
# presumably what the stop_words warning emitted on fit refers to — confirm.
betterVect = TfidfVectorizer(
    binary=True,
    stop_words="english",
    preprocessor=message_preprocessing,
)

In [13]:
# Fit the vectorizer on the training messages, then encode the Kaggle
# messages with that same fitted vectorizer.
train_messages = df_train['message']
kaggle_messages = df_kaggle_test['message']
X_vect = betterVect.fit_transform(train_messages)
X_kaggle = betterVect.transform(kaggle_messages)
X_vect

  'stop_words.' % sorted(inconsistent))


<17060x14907 sparse matrix of type '<class 'numpy.float64'>'
	with 203402 stored elements in Compressed Sparse Row format>

In [14]:
# Hold out 20% of the resampled rows for local evaluation.
# NOTE(review): three classes were upsampled with replacement before this
# split, so copies of the same tweet can fall in both X_train and X_test and
# the held-out scores may be optimistic.
X, y = X_vect, df_train['sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train

<13648x14907 sparse matrix of type '<class 'numpy.float64'>'
	with 162955 stored elements in Compressed Sparse Row format>

In [15]:
# Train the support-vector classifier on the training split; fit() returns
# the estimator itself, so the last line displays the fitted model.
svc = SVC(C=1, gamma=2).fit(X_train, y_train)
svc

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=2, kernel='rbf', max_iter=-1,
    probability=False, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [16]:
# Predict sentiment labels for the held-out split.
y_pred = svc.predict(X_test)

In [17]:
# Micro-averaged F1 of the predictions on the held-out split.
f1_score(y_test, y_pred, average='micro')

0.8962485345838218

In [18]:
# Per-class precision, recall and F1 on the held-out split; print() is used so
# the report's line breaks render.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          -1       0.99      0.95      0.97       824
           0       0.93      0.87      0.90       876
           1       0.76      0.89      0.82       851
           2       0.93      0.88      0.90       861

    accuracy                           0.90      3412
   macro avg       0.90      0.90      0.90      3412
weighted avg       0.90      0.90      0.90      3412



In [19]:
# Build the submission frame: one predicted sentiment per Kaggle tweet id,
# indexed by 'tweetid'.
kaggle_pred = svc.predict(X_kaggle).astype(int)
result_frame = {'tweetid': df_kaggle_test.index, 'sentiment': kaggle_pred}
df_result = pd.DataFrame(result_frame).set_index('tweetid')

# Sanity check: which labels occur among the predictions.
df_result["sentiment"].unique()

array([ 1,  0,  2, -1], dtype=int64)

In [21]:
# Display the submission frame (predicted sentiment indexed by tweetid).
df_result

Unnamed: 0_level_0,sentiment
tweetid,Unnamed: 1_level_1
169760,1
35326,1
224985,1
476263,1
872928,0
...,...
895714,1
875167,1
78329,2
867455,0


In [22]:
# Write the submission file; the tweetid index is written along with the
# sentiment column.
df_result.to_csv('KaggleSubmission_20201019_16.csv')