In [1]:
from zipfile import ZipFile
from pathlib import Path

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df_dev = pd.read_csv(Path.cwd()/'data'/'DSL2122_january_dataset'/'development.csv')

In [3]:
df_dev['date'] = df_dev['date'].\
    astype('string')\
    .str.split(' ')\
    .apply(lambda x : ' '.join([x[i] for i in [1,2,3,5]]))\
    .pipe(pd.to_datetime)

### Stopwords analysis: sklearn VS nltk

#### nltk

In [4]:
from nltk import download as nltk_download

nltk_download('stopwords')

from nltk.corpus import stopwords

nltk_stopwords = stopwords.words('english')
print(stopwords.words('english'))
len(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\edo_c\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


179

#### sklearn

In [5]:
from sklearn.feature_extraction import text

sklearn_stopwords = list(text.ENGLISH_STOP_WORDS)
print(list(text.ENGLISH_STOP_WORDS))
len(list(text.ENGLISH_STOP_WORDS))

['show', 'put', 'same', 'whole', 'twenty', 'always', 'below', 'behind', 'back', 'were', 'cry', 'there', 'against', 'other', 'go', 'further', 'ever', 'two', 'me', 'only', 'across', 'which', 'being', 'none', 'before', 'detail', 'eg', 'several', 'herself', 'its', 'or', 'never', 'former', 'wherein', 'after', 'six', 'via', 'somewhere', 'latter', 'he', 'can', 'every', 'whoever', 'nothing', 'least', 'had', 'became', 'well', 'nine', 'yours', 'thence', 'from', 'her', 'you', 'indeed', 'herein', 'often', 'sincere', 'do', 'on', 'with', 'may', 'around', 'seems', 'without', 'if', 'i', 'yourself', 'take', 'bottom', 'been', 'first', 'hers', 'ours', 'although', 'whom', 'be', 'myself', 'whatever', 'moreover', 'still', 'anyone', 'out', 'fifteen', 'seemed', 'nevertheless', 'within', 'either', 'thru', 'beyond', 'yet', 'everyone', 'ourselves', 'would', 'hereby', 'among', 'otherwise', 'becomes', 'formerly', 'four', 'becoming', 'nobody', 'thereby', 'to', 'else', 'empty', 'nor', 'etc', 'found', 'thereupon', 'f

318

In [6]:
stopwords_from_all_sources = pd.DataFrame(list(set(nltk_stopwords).union(set(sklearn_stopwords))), columns=['all_stopwords'])

stopwords_from_all_sources['sklearn'] = False
stopwords_from_all_sources.loc[stopwords_from_all_sources['all_stopwords'].isin(sklearn_stopwords),['sklearn']] = True

stopwords_from_all_sources['nltk'] = False
stopwords_from_all_sources.loc[stopwords_from_all_sources['all_stopwords'].isin(nltk_stopwords),['nltk']] = True

stopwords_from_all_sources

Unnamed: 0,all_stopwords,sklearn,nltk
0,last,True,False
1,show,True,False
2,you'll,False,True
3,put,True,False
4,did,False,True
...,...,...,...
373,him,True,True
374,however,True,False
375,us,True,False
376,fire,True,False


I should try to use both all stopwords, and sklearn and nltk stopwords singularly (and also no stopwords)

In [7]:
stopwords_all_list = list(stopwords_from_all_sources['all_stopwords'])

stopwords_all_NEG_list = [f'{i+"_neg"}' for i in stopwords_all_list]

def stopwords_list_gen(source_list = [sklearn_stopwords,nltk_stopwords], generate_neg = True):
    stopwords_all_list = set()
    for source in source_list:
        stopwords_all_list = stopwords_all_list.union(set(source))
        if generate_neg:
            stopwords_all_list = stopwords_all_list.union(set([f'{i+"_neg"}' for i in source]))
    return stopwords_all_list


### Preprocessing with stemming and lemmatization

Tokenization

In [8]:
df_dev['text_token'] = df_dev['text'].str.split()

Elimination of '@words'

In [9]:
df_dev['text_noAt'] = df_dev['text_token'].apply(lambda x : [i for i in x if not(i.startswith('@'))])

Elimination of repeated letters (incomplete)

In [10]:
a = 'ulaaa'

def remove_repeated_letters(word):
    for letter in word:
        print(letter)

# nltk_stemmer.stem(a)

Remove puntuation

In [11]:
import string as py_string
df_dev['text_noPunct'] = df_dev['text_noAt']\
    .apply(lambda x : [i.translate(str.maketrans('', '', py_string.punctuation)) for i in x])\
    .apply(lambda x : [i for i in x if i != ''])

Stemming 

In [12]:
from nltk.stem import PorterStemmer, LancasterStemmer, SnowballStemmer, WordNetLemmatizer

# nltk_stemmer = PorterStemmer()
# nltk_stemmer = LancasterStemmer()
nltk_stemmer = SnowballStemmer('english')

df_dev['text_stemmed'] = df_dev['text_noPunct'].apply(lambda x : [nltk_stemmer.stem(word) for word in x])

Negation

In [13]:
from nltk.sentiment.util import mark_negation

df_dev['text_neg'] = df_dev['text_stemmed'].apply(lambda x : mark_negation(x))

## Model creation

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+').tokenize

df_dev_final = df_dev
# df_dev_final['text_final'] = df_dev['text_neg'].apply(lambda x : ' '.join(x))
df_dev_final['text_final'] = df_dev_final['text']

stopwords_to_use = [nltk_stemmer.stem(word) for word in stopwords_list_gen()]

vectorizer = TfidfVectorizer(
    # stop_words = stopwords_to_use,
    stop_words = 'english',
    # tokenizer = tokenizer,
    binary=True, 
    use_idf=True, 
    norm='l2',
    smooth_idf=True
)

wpm = vectorizer.fit_transform(df_dev_final['text_final'])

N = 200

word_freq = pd.Series(
    data = np.asarray(wpm.sum(axis=0)).squeeze(),
    index = vectorizer.get_feature_names_out()
).sort_values(ascending=False)

word_freq = word_freq[:N]

word_ind = [w in word_freq.index for w in vectorizer.get_feature_names_out()]

words_df = pd.DataFrame(
    data = wpm[:,word_ind].toarray(),
    columns = vectorizer.get_feature_names_out()[word_ind],
    index = df_dev.index
).add_prefix('word_')

# words_df
from sklearn.model_selection import train_test_split

X = words_df.values
y = df_dev['sentiment'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, stratify=y, train_size=0.7, random_state=50)

In [64]:
words_df_analysis = words_df.join(df_dev['sentiment']).groupby(['sentiment']).sum().transpose()
words_df_analysis.columns
words_df_analysis['diff'] = (words_df_analysis.iloc[:,1] - words_df_analysis.iloc[:,0]) / (words_df_analysis.iloc[:,1] + words_df_analysis.iloc[:,0])

### RandomForestClassifier

In [68]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, classification_report

rfc = RandomForestClassifier(
    n_estimators = 150, 
    random_state = 42, 
    min_impurity_decrease = 0.0,
    n_jobs=-3
)

rfc.fit(X_train, y_train)

f1 = f1_score(y_test, rfc.predict(X_test),average='macro')
report = classification_report(y_test, rfc.predict(X_test))

print(f1)
print(report)

0.6561407503905465
              precision    recall  f1-score   support

           0       0.64      0.53      0.58     28451
           1       0.69      0.78      0.74     39048

    accuracy                           0.67     67499
   macro avg       0.67      0.65      0.66     67499
weighted avg       0.67      0.67      0.67     67499



### SVC

In [69]:
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score

rfc = LinearSVC(random_state=42)

rfc.fit(X_train, y_train)

f1 = f1_score(y_test, rfc.predict(X_test),average='macro')
report = classification_report(y_test, rfc.predict(X_test))

print(f1)
print(report)

0.6787789695814289
              precision    recall  f1-score   support

           0       0.71      0.51      0.59     28451
           1       0.70      0.85      0.77     39048

    accuracy                           0.70     67499
   macro avg       0.70      0.68      0.68     67499
weighted avg       0.70      0.70      0.69     67499



### GaussianNB

In [70]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score

rfc = GaussianNB()

rfc.fit(X_train, y_train)

f1 = f1_score(y_test, rfc.predict(X_test),average='macro')
report = classification_report(y_test, rfc.predict(X_test))

print(f1)
print(report)

0.6525212956065252
              precision    recall  f1-score   support

           0       0.56      0.81      0.66     28451
           1       0.80      0.54      0.64     39048

    accuracy                           0.65     67499
   macro avg       0.68      0.67      0.65     67499
weighted avg       0.70      0.65      0.65     67499



### LogisticRegression

In [75]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

rfc = LogisticRegression(n_jobs=-4)

rfc.fit(X_train, y_train)

f1 = f1_score(y_test, rfc.predict(X_test),average='macro')
report = classification_report(y_test, rfc.predict(X_test))

print(f1)
print(report)

0.6761852347914264
              precision    recall  f1-score   support

           0       0.71      0.50      0.58     28451
           1       0.70      0.85      0.77     39048

    accuracy                           0.70     67499
   macro avg       0.70      0.67      0.68     67499
weighted avg       0.70      0.70      0.69     67499



In [None]:
from sklearn.model_selection import GridSearchCV, ParameterGrid

params = {
    
}


### MLPClassifier

In [71]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import f1_score

rfc = MLPClassifier(verbose = True)

rfc.fit(X_train, y_train)

f1 = f1_score(y_test, rfc.predict(X_test),average='macro')
report = classification_report(y_test, rfc.predict(X_test))

print(f1)
print(report)

Iteration 1, loss = 0.56999814
Iteration 2, loss = 0.54440468
Iteration 3, loss = 0.54240847
Iteration 4, loss = 0.54047893
Iteration 5, loss = 0.53869067
Iteration 6, loss = 0.53682975
Iteration 7, loss = 0.53520252
Iteration 8, loss = 0.53349645
Iteration 9, loss = 0.53177883
Iteration 10, loss = 0.53044132
Iteration 11, loss = 0.52913975
Iteration 12, loss = 0.52780389
Iteration 13, loss = 0.52658603
Iteration 14, loss = 0.52547633
Iteration 15, loss = 0.52441353
Iteration 16, loss = 0.52312409
Iteration 17, loss = 0.52223343
Iteration 18, loss = 0.52109321
Iteration 19, loss = 0.52009236
Iteration 20, loss = 0.51897507
Iteration 21, loss = 0.51800636
Iteration 22, loss = 0.51728088
Iteration 23, loss = 0.51633078
Iteration 24, loss = 0.51520809
Iteration 25, loss = 0.51445030
Iteration 26, loss = 0.51337090
Iteration 27, loss = 0.51238160
Iteration 28, loss = 0.51141686
Iteration 29, loss = 0.51072541
Iteration 30, loss = 0.50988250
Iteration 31, loss = 0.50899344
Iteration 32, los



0.6693705135515255
              precision    recall  f1-score   support

           0       0.67      0.52      0.59     28451
           1       0.70      0.81      0.75     39048

    accuracy                           0.69     67499
   macro avg       0.68      0.67      0.67     67499
weighted avg       0.69      0.69      0.68     67499

