In [29]:
from zipfile import ZipFile
from pathlib import Path

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

In [30]:
df_dev = pd.read_csv(Path.cwd()/'data'/'DSL2122_january_dataset'/'development.csv')
df_eval = pd.read_csv(Path.cwd()/'data'/'DSL2122_january_dataset'/'evaluation.csv')

In [31]:
df_eval.shape

(74999, 5)

In [32]:
df = df_dev.append(df_eval)

In [33]:
# df['date'] = df['date'].\
#     astype('string')\
#     .str.split(' ')\
#     .apply(lambda x : ' '.join([x[i] for i in [1,2,3,5]]))\
#     .pipe(pd.to_datetime)

### Stopwords analysis: sklearn, nltk, stop_words

#### nltk

In [34]:
from nltk import download as nltk_download

nltk_download('stopwords')

from nltk.corpus import stopwords

nltk_stopwords = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\edo_c\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#### sklearn

In [35]:
from sklearn.feature_extraction import text

sklearn_stopwords = list(text.ENGLISH_STOP_WORDS)

#### stop_words

In [36]:
from stop_words import get_stop_words

stop_words_stopwords = get_stop_words('english')

I should try to use both all stopwords, and sklearn and nltk stopwords singularly (and also no stopwords)

In [37]:
def stopwords_list_gen(source_list = [sklearn_stopwords,nltk_stopwords], generate_neg = True):
    stopwords_all_list = set()
    for source in source_list:
        stopwords_all_list = stopwords_all_list.union(set(source))
        if generate_neg:
            stopwords_all_list = stopwords_all_list.union(set([f'{i+"_neg"}' for i in source]))
    return stopwords_all_list

## Preprocessing

Tokenization

In [38]:
df['text_token'] = df['text'].str.split()

Removal of '@words'

In [39]:
df['text_noAt'] = df['text_token'].apply(lambda x : [i for i in x if not(i.startswith('@'))])

Removal of '&amp' and '&quot'

In [40]:
df['text_noAmpQuot'] = df['text_noAt']\
    .apply(lambda x : [i for i in x if '&amp' not in i])\
    .apply(lambda x : [i for i in x if '&quot' not in i])

Removal of repeated letters (incomplete)

In [41]:
a = 'ulaaa'

def remove_repeated_letters(word):
    for letter in word:
        print(letter)

# nltk_stemmer.stem(a)

Remove puntuation

In [42]:
import string as py_string
df['text_noPunct'] = df['text_noAmpQuot']\
    .apply(lambda x : [i.translate(str.maketrans('', '', py_string.punctuation)) for i in x])\
    .apply(lambda x : [i for i in x if i != ''])

Negation

In [43]:
from nltk.sentiment.util import mark_negation

df['text_neg'] = df['text_noPunct'].apply(lambda x : mark_negation(x))

Removal of stopwords

In [44]:
considered_stopwords = stop_words_stopwords + [f'{word}_neg' for word in stop_words_stopwords]

df['text_noStopwords'] = df['text_neg'].apply(lambda x : [i for i in x if i not in considered_stopwords])

Stemming and lemmatization

In [45]:
# from nltk.stem import PorterStemmer, LancasterStemmer, SnowballStemmer, WordNetLemmatizer

# nltk_download('wordnet')
# nltk_download('omw-1.4')

# nltk_stemmer = PorterStemmer()
# # nltk_stemmer = LancasterStemmer()
# # nltk_stemmer = SnowballStemmer('english')
# nltk_lemmatizer = WordNetLemmatizer()
# # from nltk.corpus import wordnet

# df['text_stemmed'] = df['text_noPunct'].apply(lambda x : [nltk_stemmer.stem(word) for word in x])
# df['text_stemmed'] = df['text_noPunct'].apply(lambda x : [nltk_lemmatizer.lemmatize(word) for word in x])

User manual filter

In [46]:
df = df.loc[
    (~(df['user'] == 'lost_dog') &
    ~(df['user'] == 'webwoke') &
    ~(df['user'] == 'tweetpet') &
    ~(df['user'].str.contains('tweeteradder')) &
    ~(df['user'].str.contains('tweetfollow')) &
    ~(df['user'] == 'divxdownloads')) |
    df['sentiment'].isna()
]

In [47]:
df[df['sentiment'].isna()].shape

(74999, 12)

## Model creation

In [54]:
from sklearn.feature_extraction.text import TfidfVectorizer

df_final = df
df_final['text_final'] = df_final['text_noPunct'].apply(lambda x : ' '.join(x))
# df_final['text_final'] = df_final['text']

# stopwords_to_use = [nltk_stemmer.stem(word) for word in stopwords_list_gen()]
# stopwords_to_use = [nltk_stemmer.stem(word) for word in sklearn_stopwords]

vectorizer = TfidfVectorizer(
    # stop_words = stopwords_to_use,
    # stop_words = 'english',
    stop_words = None,
    binary=True, 
    use_idf=True, 
    norm='l2',
    smooth_idf=True
)

wpm = vectorizer.fit_transform(df_final['text_final'])

### Processing with truncatedSVD

In [70]:
N = 2300

from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=N, n_iter=10, random_state=42)

wpm_svd = svd.fit_transform(wpm)

print(wpm_svd.shape)

features_df = pd.DataFrame(
    data = wpm_svd,
    index = df_final.index
)

from sklearn.model_selection import train_test_split

mask_train_test = df_final['sentiment'].notna()

X_train_valid = features_df.loc[mask_train_test,:].values
y_train_valid = df_final.loc[mask_train_test,:]['sentiment'].values
X_test = features_df.loc[~mask_train_test,:].values

print(X_train_valid.shape)
print(y_train_valid.shape)
print(X_test.shape)

X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_valid, 
    y_train_valid, 
    shuffle=True, 
    # stratify=y_train_valid, 
    train_size=0.8, 
    random_state=42
)

(297949, 2300)
(222950, 2300)
(222950,)
(74999, 2300)


### Processing without truncatedSVD

In [24]:
N = 500

word_freq = pd.Series(
    data = np.asarray(wpm.sum(axis=0)).squeeze(),
    index = vectorizer.get_feature_names_out()
).sort_values(ascending=False)

word_freq = word_freq[:N]

word_ind = [w in word_freq.index for w in vectorizer.get_feature_names_out()]

words_df = pd.DataFrame(
    data = wpm[:,word_ind].toarray(),
    columns = vectorizer.get_feature_names_out()[word_ind],
    index = df_final.index
).add_prefix('word_')

# words_df
from sklearn.model_selection import train_test_split

mask_train_test = df_final['sentiment'].notna()

X_train_valid = words_df.loc[mask_train_test,:].values
y_train_valid = df_final.loc[mask_train_test,:]['sentiment'].values
X_test = words_df.loc[~mask_train_test,:].values

print(X_train_valid.shape)
print(y_train_valid.shape)
print(X_test.shape)

X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_valid, 
    y_train_valid, 
    shuffle=True, 
    # stratify=y_train_valid, 
    train_size=0.8, 
    random_state=50
)

(222950, 500)
(222950,)
(74999, 500)


## Models

### RandomForestClassifier

In [60]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, classification_report, confusion_matrix

rfc = RandomForestClassifier(
    n_estimators = 150, 
    random_state = 42, 
    min_impurity_decrease = 0.0,
    n_jobs=-6
)

rfc.fit(X_train, y_train)

f1 = f1_score(y_valid, rfc.predict(X_valid),average='macro')
report = classification_report(y_valid, rfc.predict(X_valid))
confusion = confusion_matrix(y_valid, rfc.predict(X_valid))

print(f1)
print(report)
print(confusion)

0.7029172012424388
              precision    recall  f1-score   support

         0.0       0.70      0.58      0.64     18800
         1.0       0.73      0.81      0.77     25790

    accuracy                           0.72     44590
   macro avg       0.71      0.70      0.70     44590
weighted avg       0.72      0.72      0.71     44590

[[10996  7804]
 [ 4774 21016]]


In [None]:
from sklearn.model_selection import GridSearchCV

params = {
    'n_estimators' : [50,100,200],
    'criterion' : ['gini','entropy'],
    'max_features' : ['auto','sqrt','log2'],
    'class_weight' : ['balanced', 'balanced_subsample', None],
    'random_state' : [42],
    'oob_score' : [True,False]
}

# params = {
#     'class_weight' : ['balanced'], 
#     'dual' : [False], 
#     'max_iter' : [500], 
#     'penalty' : ['l1'], 
#     'random_state' : [42]
# }

rfc = RandomForestClassifier()

gsCV = GridSearchCV(
    estimator = rfc,
    param_grid = params,
    scoring = 'f1_macro',
    n_jobs = -4,
    verbose = 3
)

gsCV.fit(X_train,y_train)

f1 = f1_score(y_valid, gsCV.predict(X_valid),average='macro')
report = classification_report(y_valid, gsCV.predict(X_valid))
confusion = confusion_matrix(y_valid, gsCV.predict(X_valid))

print(f1)
print(report)
print(confusion)

### LinearSVC

In [71]:
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score, classification_report, confusion_matrix

rfc = LinearSVC(C=10, class_weight='balanced', dual=False, max_iter=100, random_state=42, tol=0.001)
rfc.fit(X_train, y_train)

f1 = f1_score(y_valid, rfc.predict(X_valid),average='macro')
report = classification_report(y_valid, rfc.predict(X_valid))
confusion = confusion_matrix(y_valid, rfc.predict(X_valid))

print(f1)
print(report)
print(confusion)

0.7738584559325508
              precision    recall  f1-score   support

         0.0       0.73      0.76      0.74     18800
         1.0       0.82      0.79      0.80     25790

    accuracy                           0.78     44590
   macro avg       0.77      0.78      0.77     44590
weighted avg       0.78      0.78      0.78     44590

[[14298  4502]
 [ 5393 20397]]


In [67]:
from sklearn.model_selection import GridSearchCV

params = {
    'penalty' : ['l1','l2'],
    'dual' : [False], # [True, False],
    'tol' : [1e-3,1e-4,1e-5],
    'fit_intercept' : [True, False],
    'class_weight' : ['balanced', None],
    # 'verbose' : [1],
    'max_iter' : [100,200,500,1000,2000],
    'random_state' : [42],
    'C' : [1,5,10,50,100,500,1000]
}

# params = {
#     'class_weight' : ['balanced'], 
#     'dual' : [False], 
#     'max_iter' : [500], 
#     'penalty' : ['l1'], 
#     'random_state' : [42]
# }

rfc = LinearSVC()

gsCV = GridSearchCV(
    estimator = rfc,
    param_grid = params,
    scoring = 'f1_macro',
    n_jobs = -4,
    verbose = 3
)

gsCV.fit(X_train,y_train)

f1 = f1_score(y_valid, gsCV.predict(X_valid),average='macro')
report = classification_report(y_valid, gsCV.predict(X_valid))
confusion = confusion_matrix(y_valid, gsCV.predict(X_valid))

print(f1)
print(report)
print(confusion)

Fitting 5 folds for each of 840 candidates, totalling 4200 fits
0.7177860909354472
              precision    recall  f1-score   support

         0.0       0.66      0.71      0.68     18800
         1.0       0.78      0.73      0.75     25790

    accuracy                           0.72     44590
   macro avg       0.72      0.72      0.72     44590
weighted avg       0.73      0.72      0.72     44590

[[13441  5359]
 [ 7047 18743]]


In [68]:
gsCV.best_estimator_

LinearSVC(C=10, class_weight='balanced', dual=False, max_iter=100,
          random_state=42, tol=0.001)

### GaussianNB

In [252]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score, classification_report, confusion_matrix

rfc = GaussianNB()

rfc.fit(X_train, y_train)

f1 = f1_score(y_valid, rfc.predict(X_valid),average='macro')
report = classification_report(y_valid, rfc.predict(X_valid))
confusion = confusion_matrix(y_valid, rfc.predict(X_valid))

print(f1)
print(report)
print(confusion)

0.7023192652722203
              precision    recall  f1-score   support

           0       0.66      0.65      0.65     28199
           1       0.75      0.76      0.75     38686

    accuracy                           0.71     66885
   macro avg       0.70      0.70      0.70     66885
weighted avg       0.71      0.71      0.71     66885

[[18285  9914]
 [ 9464 29222]]


### LogisticRegression

In [253]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, classification_report, confusion_matrix

rfc = LogisticRegression(n_jobs=-4)

rfc.fit(X_train, y_train)

f1 = f1_score(y_test, rfc.predict(X_test),average='macro')
report = classification_report(y_test, rfc.predict(X_test))
confusion = confusion_matrix(y_test, rfc.predict(X_test))

print(f1)
print(report)
print(confusion)

0.7335232472069035
              precision    recall  f1-score   support

           0       0.73      0.63      0.68     28199
           1       0.75      0.83      0.79     38686

    accuracy                           0.75     66885
   macro avg       0.74      0.73      0.73     66885
weighted avg       0.75      0.75      0.74     66885

[[17625 10574]
 [ 6392 32294]]


### MLPClassifier

In [71]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import f1_score, classification_report

rfc = MLPClassifier(verbose = True)

rfc.fit(X_train, y_train)

f1 = f1_score(y_test, rfc.predict(X_test),average='macro')
report = classification_report(y_test, rfc.predict(X_test))

print(f1)
print(report)

Iteration 1, loss = 0.56999814
Iteration 2, loss = 0.54440468
Iteration 3, loss = 0.54240847
Iteration 4, loss = 0.54047893
Iteration 5, loss = 0.53869067
Iteration 6, loss = 0.53682975
Iteration 7, loss = 0.53520252
Iteration 8, loss = 0.53349645
Iteration 9, loss = 0.53177883
Iteration 10, loss = 0.53044132
Iteration 11, loss = 0.52913975
Iteration 12, loss = 0.52780389
Iteration 13, loss = 0.52658603
Iteration 14, loss = 0.52547633
Iteration 15, loss = 0.52441353
Iteration 16, loss = 0.52312409
Iteration 17, loss = 0.52223343
Iteration 18, loss = 0.52109321
Iteration 19, loss = 0.52009236
Iteration 20, loss = 0.51897507
Iteration 21, loss = 0.51800636
Iteration 22, loss = 0.51728088
Iteration 23, loss = 0.51633078
Iteration 24, loss = 0.51520809
Iteration 25, loss = 0.51445030
Iteration 26, loss = 0.51337090
Iteration 27, loss = 0.51238160
Iteration 28, loss = 0.51141686
Iteration 29, loss = 0.51072541
Iteration 30, loss = 0.50988250
Iteration 31, loss = 0.50899344
Iteration 32, los



0.6693705135515255
              precision    recall  f1-score   support

           0       0.67      0.52      0.59     28451
           1       0.70      0.81      0.75     39048

    accuracy                           0.69     67499
   macro avg       0.68      0.67      0.67     67499
weighted avg       0.69      0.69      0.68     67499



### SGDClassifier

In [254]:
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import f1_score, classification_report, confusion_matrix

rfc = SGDClassifier(n_jobs=-5, random_state=50)

rfc.fit(X_train, y_train)

f1 = f1_score(y_test, rfc.predict(X_test),average='macro')
report = classification_report(y_test, rfc.predict(X_test))
confusion = confusion_matrix(y_test, rfc.predict(X_test))

print(f1)
print(report)
print(confusion)

0.7041128433588708
              precision    recall  f1-score   support

           0       0.77      0.52      0.62     28199
           1       0.71      0.89      0.79     38686

    accuracy                           0.73     66885
   macro avg       0.74      0.70      0.70     66885
weighted avg       0.74      0.73      0.72     66885

[[14526 13673]
 [ 4388 34298]]


### SVC

In [255]:
from sklearn.svm import SVC
from sklearn.metrics import f1_score, classification_report, confusion_matrix

rfc = SVC(random_state=50)

rfc.fit(X_train, y_train)

f1 = f1_score(y_test, rfc.predict(X_test),average='macro')
report = classification_report(y_test, rfc.predict(X_test))
confusion = confusion_matrix(y_test, rfc.predict(X_test))

print(f1)
print(report)
print(confusion)

## Submission

In [72]:
from sklearn.svm import LinearSVC

# rfc = LinearSVC(random_state=70)

# rfc.fit(X_train, y_train)

y_pred = rfc.predict(X_test)


In [73]:
pd.DataFrame(y_pred.astype(int),columns=['Predicted']).to_csv("output2.csv",index_label="Id", header=["Predicted"])