In [41]:
from zipfile import ZipFile
from pathlib import Path

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

In [42]:
# Development split, read from <cwd>/data/DSL2122_january_dataset/development.csv.
dev_csv_path = Path.cwd().joinpath('data', 'DSL2122_january_dataset', 'development.csv')
df_dev = pd.read_csv(dev_csv_path)

In [43]:
# The raw 'date' strings are split on single spaces and only the fields at
# positions 1, 2, 3 and 5 are kept before parsing (presumably month, day, time
# and year, i.e. weekday and timezone are dropped -- confirm against the CSV).
#
# The dtype guard makes this cell safe to re-run: once the column is already
# datetime, its string form no longer has six space-separated fields and the
# positional lookup below would raise an IndexError.
if not pd.api.types.is_datetime64_any_dtype(df_dev['date']):
    df_dev['date'] = (
        df_dev['date']
        .astype('string')
        .str.split(' ')
        .apply(lambda fields: ' '.join([fields[pos] for pos in (1, 2, 3, 5)]))
        .pipe(pd.to_datetime)
    )

### Stopwords analysis: sklearn VS nltk

#### nltk

In [44]:
from nltk import download as nltk_download

# Make sure the stop-word corpus is available locally before it is read.
nltk_download('stopwords')

from nltk.corpus import stopwords

# Read the English list once and reuse it for the printout and the count.
nltk_stopwords = stopwords.words('english')
print(nltk_stopwords)
len(nltk_stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\edo_c\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


179

#### sklearn

In [45]:
from sklearn.feature_extraction import text

# Materialise sklearn's built-in English stop-word set as a list, then show
# its contents and size.
sklearn_stopwords = list(text.ENGLISH_STOP_WORDS)
print(sklearn_stopwords)
len(sklearn_stopwords)

['sometimes', 'thin', 'amongst', 'mostly', 'empty', 'namely', 'therefore', 'here', 'been', 'twenty', 'whither', 'whom', 'call', 'against', 'con', 'her', 'some', 'afterwards', 'indeed', 'wherever', 'below', 'whereafter', 'might', 'too', 'thus', 'both', 'after', 'ten', 'five', 'forty', 'hasnt', 'less', 'thereupon', 'whence', 'be', 'someone', 'also', 'eleven', 'mill', 'out', 'formerly', 'three', 'will', 'cry', 'which', 'again', 'very', 'when', 'do', 'himself', 'my', 'hereby', 'twelve', 'at', 'under', 'throughout', 'down', 'otherwise', 're', 'before', 'couldnt', 'beyond', 'amount', 'up', 'much', 'yourselves', 'towards', 'whether', 'thereafter', 'hereafter', 'that', 'perhaps', 'system', 'meanwhile', 'they', 'part', 'alone', 'for', 'him', 'me', 'put', 'nowhere', 'yourself', 'due', 'nor', 'ours', 'now', 'hereupon', 'fire', 'fill', 'show', 'next', 'further', 'bottom', 'toward', 'is', 'several', 'somehow', 'to', 'thereby', 'first', 'we', 'move', 'while', 'across', 'own', 'all', 'etc', 'un', 'am

318

In [46]:
# One row per distinct stop word found in either library, with a boolean
# column per library telling whether that library's list contains the word.
stopwords_from_all_sources = pd.DataFrame(
    {'all_stopwords': list(set(nltk_stopwords).union(set(sklearn_stopwords)))}
).assign(
    sklearn=lambda frame: frame['all_stopwords'].isin(sklearn_stopwords),
    nltk=lambda frame: frame['all_stopwords'].isin(nltk_stopwords),
)

stopwords_from_all_sources

Unnamed: 0,all_stopwords,sklearn,nltk
0,sometimes,True,False
1,thin,True,False
2,two,True,False
3,many,True,False
4,its,True,True
...,...,...,...
373,herself,True,True
374,only,True,True
375,would,True,False
376,itself,True,True


TODO: compare four stop-word configurations: the union of both lists, the sklearn list alone, the nltk list alone, and no stop-word removal at all.

In [47]:
# Flat list of every stop word from the comparison table ...
stopwords_all_list = stopwords_from_all_sources['all_stopwords'].tolist()

# ... and the same words carrying the "_neg" suffix.
stopwords_all_NEG_list = [stop_word + "_neg" for stop_word in stopwords_all_list]

def stopwords_list_gen(source_list=None, generate_neg=True):
    """Merge several stop-word collections into a single set.

    Parameters
    ----------
    source_list : iterable of iterables of str, optional
        The stop-word collections to merge. When omitted, the module-level
        ``sklearn_stopwords`` and ``nltk_stopwords`` lists are used; they are
        looked up at call time rather than captured in a mutable default.
    generate_neg : bool, default True
        When true, every word is also added with a ``"_neg"`` suffix.

    Returns
    -------
    set of str
        The union of all words (plus their ``"_neg"`` variants if requested).
    """
    if source_list is None:
        source_list = [sklearn_stopwords, nltk_stopwords]

    merged = set()
    for source in source_list:
        # Materialise once so one-shot iterables are not exhausted before the
        # "_neg" variants are generated.
        words = set(source)
        merged |= words
        if generate_neg:
            merged.update(word + "_neg" for word in words)
    return merged

### Preprocessing with stemming and lemmatization

Tokenization

In [48]:
# Split every tweet on runs of whitespace; each row of 'text_token' becomes a
# list of raw tokens (punctuation, mentions and HTML entities still attached).
df_dev['text_token'] = df_dev['text'].str.split()

Removal of '@words'

In [49]:
# Drop every token that begins with '@'.
df_dev['text_noAt'] = df_dev['text_token'].apply(
    lambda tokens: [tok for tok in tokens if not tok.startswith('@')]
)

Removal of tokens containing '&amp' or '&quot' (the whole token is dropped, not just the entity)

In [50]:
# Discard any token that contains the substring '&amp' or '&quot'. The whole
# token is removed, not only the entity inside it.
df_dev['text_noAmpQuot'] = df_dev['text_noAt'].apply(
    lambda tokens: [tok for tok in tokens if '&amp' not in tok and '&quot' not in tok]
)

Elimination of repeated letters (incomplete)

In [51]:
# Sample input ending in a run of repeated letters.
a = 'ulaaa'

def remove_repeated_letters(word):
    """Placeholder: print each element of ``word`` on its own line.

    Despite the name, nothing is removed and nothing is returned (the result
    is an implicit ``None``).
    TODO: implement the repeated-letter elimination this section describes.
    """
    for letter in word:
        print(letter)

# nltk_stemmer.stem(a)

Remove punctuation

In [52]:
import string as py_string
# Deletion table for every character in string.punctuation. It is built once
# here instead of being rebuilt by str.maketrans for every single token.
_punct_table = str.maketrans('', '', py_string.punctuation)

# Strip punctuation characters from each token, then drop tokens that end up
# empty (i.e. tokens that consisted of punctuation only).
df_dev['text_noPunct'] = df_dev['text_noAmpQuot'].apply(
    lambda tokens: [
        cleaned
        for cleaned in (tok.translate(_punct_table) for tok in tokens)
        if cleaned != ''
    ]
)

Stemming and lemmatization

In [103]:
from nltk.stem import PorterStemmer, LancasterStemmer, SnowballStemmer, WordNetLemmatizer

# Corpora fetched up front (presumably for the WordNetLemmatizer alternative;
# the Porter stemmer used below is not shown to need them).
nltk_download('wordnet')
nltk_download('omw-1.4')

# Active normaliser is the Porter stemmer. LancasterStemmer() and
# SnowballStemmer('english') are imported above as possible replacements.
nltk_stemmer = PorterStemmer()
nltk_lemmatizer = WordNetLemmatizer()


def _stem_tokens(tokens):
    """Return the stem of every token in ``tokens`` using ``nltk_stemmer``."""
    return [nltk_stemmer.stem(token) for token in tokens]


df_dev['text_stemmed'] = df_dev['text_noPunct'].apply(_stem_tokens)
# Lemmatizer variant: use nltk_lemmatizer.lemmatize(token) in place of the stem call.

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\edo_c\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\edo_c\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Negation

In [104]:
from nltk.sentiment.util import mark_negation

# Tag tokens that fall inside a negation scope, one tweet (token list) at a time.
# NOTE(review): punctuation has already been stripped upstream; if
# mark_negation relies on clause punctuation to close a scope, every token
# after a negation word is tagged up to the end of the tweet -- confirm this
# is intended.
df_dev['text_neg'] = df_dev['text_stemmed'].apply(mark_negation)

User manual filter

In [100]:
# Manually curated accounts to exclude: four exact user names plus any user
# name containing 'tweeteradder' or 'tweetfollow'.
_user = df_dev['user']
_is_listed_user = _user.isin(['lost_dog', 'webwoke', 'tweetpet', 'divxdownloads'])
_is_adder_user = _user.str.contains('tweeteradder')
_is_follow_user = _user.str.contains('tweetfollow')

df_dev = df_dev.loc[~_is_listed_user & ~_is_adder_user & ~_is_follow_user]

## Model creation

In [128]:
from sklearn.feature_extraction.text import TfidfVectorizer

# NOTE: this binds a second name to the same DataFrame (no copy is made), so
# the 'text_final' column added below is visible through df_dev as well.
df_dev_final = df_dev
df_dev_final['text_final'] = df_dev['text_neg'].apply(lambda tokens: ' '.join(tokens))

# Stemmed stop words, kept ready for the stop_words experiments. The
# vectorizer below currently runs with stop_words=None, so it does not use them.
stopwords_to_use = [nltk_stemmer.stem(stop_word) for stop_word in stopwords_all_list]

vectorizer = TfidfVectorizer(
    stop_words=None,  # other options explored: stopwords_to_use, 'english'
    binary=True,
    use_idf=True,
    norm='l2',
    smooth_idf=True,
)

wpm = vectorizer.fit_transform(df_dev_final['text_final'])

# Number of vocabulary terms retained as model features.
N = 2000

feature_names = vectorizer.get_feature_names_out()

# Rank terms by the column sums of the TF-IDF matrix and keep the N largest.
word_freq = pd.Series(
    data=np.asarray(wpm.sum(axis=0)).squeeze(),
    index=feature_names,
).sort_values(ascending=False).head(N)

# Boolean mask over the full vocabulary: True where the term was retained.
_top_terms = set(word_freq.index)
word_ind = [term in _top_terms for term in feature_names]

# Dense feature table: one 'word_<term>' column per retained term.
words_df = pd.DataFrame(
    data=wpm[:, word_ind].toarray(),
    columns=feature_names[word_ind],
    index=df_dev.index,
).add_prefix('word_')

# words_df
from sklearn.model_selection import train_test_split

X = words_df.values
y = df_dev['sentiment'].values

# Stratified 70/30 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    shuffle=True,
    stratify=y,
    random_state=50,
)

In [73]:
# Sum every word feature per sentiment label, then transpose so that rows are
# words and columns are the sentiment labels.
words_df_analysis = (
    words_df
    .join(df_dev['sentiment'])
    .groupby(['sentiment'])
    .sum()
    .transpose()
)

# 'diff' = (second label column - first label column) / (their sum), i.e. a
# normalised measure of which label a word leans towards.
_first_label_total = words_df_analysis.iloc[:, 0]
_second_label_total = words_df_analysis.iloc[:, 1]
words_df_analysis['diff'] = (
    (_second_label_total - _first_label_total)
    / (_second_label_total + _first_label_total)
)

In [91]:
# Order words by descending 'diff' and display rows 470..498 of that ranking.
_ranked_by_diff = words_df_analysis.sort_values(by='diff', ascending=False)
_ranked_by_diff.iloc[470:499]

# words_df_analysis.loc[words_df_analysis.index.str.contains('movie')]

sentiment,0,1,diff
word_damn,329.246677,108.698845,-0.503596
word_cant_neg,141.002922,43.463834,-0.528762
word_bored,248.789705,76.521481,-0.529549
word_want_neg,300.128996,91.835226,-0.53141
word_feel_neg,176.485389,50.287525,-0.556494
word_stupid,190.261406,54.017513,-0.557739
word_lost,239.839159,67.327007,-0.561625
word_crap,144.567724,38.978176,-0.575276
word_bad,577.674582,150.989862,-0.585571
word_sorry,908.412752,235.021268,-0.58892


In [90]:
# Show every row whose final text contains ' wait ' (with surrounding spaces).
_mentions_wait = df_dev_final['text_final'].str.contains(' wait ')
df_dev_final.loc[_mentions_wait, :]

Unnamed: 0,sentiment,ids,date,flag,user,text,text_token,text_noAt,text_noAmpQuot,text_noPunct,text_stemmed,text_neg,text_final
195,0,2069225157,2009-06-07 15:29:23,NO_QUERY,icysun23,@SackPackies Yes @ohflawless I know I think...,"[@SackPackies, Yes, @ohflawless, I, know, I, t...","[Yes, I, know, I, think, I'll, just, wait, for...","[Yes, I, know, I, think, I'll, just, wait, for...","[Yes, I, know, I, think, Ill, just, wait, for,...","[Yes, I, know, I, think, Ill, just, wait, for,...","[Yes, I, know, I, think, Ill, just, wait, for,...",Yes I know I think Ill just wait for tomorrow ...
323,1,2064066330,2009-06-07 05:06:26,NO_QUERY,misharae74,@dave_annable B&amp;S is in my NetFlix queue &...,"[@dave_annable, B&amp;S, is, in, my, NetFlix, ...","[B&amp;S, is, in, my, NetFlix, queue, &amp;, I...","[is, in, my, NetFlix, queue, I, should, receiv...","[is, in, my, NetFlix, queue, I, should, receiv...","[is, in, my, NetFlix, queue, I, should, receiv...","[is, in, my, NetFlix, queue, I, should, receiv...",is in my NetFlix queue I should receive the fi...
470,0,2251729570,2009-06-20 04:24:29,NO_QUERY,roxiijonas,I wanna watch PPP so bad! But my dads watching...,"[I, wanna, watch, PPP, so, bad!, But, my, dads...","[I, wanna, watch, PPP, so, bad!, But, my, dads...","[I, wanna, watch, PPP, so, bad!, But, my, dads...","[I, wanna, watch, PPP, so, bad, But, my, dads,...","[I, wanna, watch, PPP, so, bad, But, my, dad, ...","[I, wanna, watch, PPP, so, bad, But, my, dad, ...",I wanna watch PPP so bad But my dad watching t...
532,0,2051330834,2009-06-05 21:41:00,NO_QUERY,Xboxking,@CrazyEcho YAH... ohh wait... We'll miss you C...,"[@CrazyEcho, YAH..., ohh, wait..., We'll, miss...","[YAH..., ohh, wait..., We'll, miss, you, Chuck...","[YAH..., ohh, wait..., We'll, miss, you, Chuck...","[YAH, ohh, wait, Well, miss, you, Chuck, lets,...","[YAH, ohh, wait, Well, miss, you, Chuck, let, ...","[YAH, ohh, wait, Well, miss, you, Chuck, let, ...",YAH ohh wait Well miss you Chuck let take a mo...
536,1,2183744916,2009-06-15 14:55:43,NO_QUERY,Mum_of_Six,@KP_eezy I'll definitely keep you posted. Hop...,"[@KP_eezy, I'll, definitely, keep, you, posted...","[I'll, definitely, keep, you, posted., Hoping,...","[I'll, definitely, keep, you, posted., Hoping,...","[Ill, definitely, keep, you, posted, Hoping, t...","[Ill, definitely, keep, you, posted, Hoping, t...","[Ill, definitely, keep, you, posted, Hoping, t...",Ill definitely keep you posted Hoping to go ov...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
223943,0,2004066058,2009-06-02 07:36:39,NO_QUERY,FaithfulChosen,@jubean true But wait till she becomes his se...,"[@jubean, true, But, wait, till, she, becomes,...","[true, But, wait, till, she, becomes, his, sec...","[true, But, wait, till, she, becomes, his, sec...","[true, But, wait, till, she, becomes, his, sec...","[true, But, wait, till, she, becomes, his, sec...","[true, But, wait, till, she, becomes, his, sec...",true But wait till she becomes his second mommy
224019,1,2002619508,2009-06-02 04:43:38,NO_QUERY,its4am,@durian_girl lol you have to let the little on...,"[@durian_girl, lol, you, have, to, let, the, l...","[lol, you, have, to, let, the, little, one, sl...","[lol, you, have, to, let, the, little, one, sl...","[lol, you, have, to, let, the, little, one, sl...","[lol, you, have, to, let, the, little, one, sl...","[lol, you, have, to, let, the, little, one, sl...",lol you have to let the little one sleep You h...
224764,1,1984915911,2009-05-31 16:03:41,NO_QUERY,Jennifer_x_,"Today was soooo good ! M&amp;Ds was aweeesome,...","[Today, was, soooo, good, !, M&amp;Ds, was, aw...","[Today, was, soooo, good, !, M&amp;Ds, was, aw...","[Today, was, soooo, good, !, was, aweeesome,, ...","[Today, was, soooo, good, was, aweeesome, BBQ,...","[Today, wa, soooo, good, wa, aweeesome, BBQ, w...","[Today, wa, soooo, good, wa, aweeesome, BBQ, w...",Today wa soooo good wa aweeesome BBQ wa so yum...
224767,0,2201725794,2009-06-16 20:41:04,NO_QUERY,xPurplexMuffinx,Just woke up... School today Can't wait til i...,"[Just, woke, up..., School, today, Can't, wait...","[Just, woke, up..., School, today, Can't, wait...","[Just, woke, up..., School, today, Can't, wait...","[Just, woke, up, School, today, Cant, wait, ti...","[Just, woke, up, School, today, Cant, wait, ti...","[Just, woke, up, School, today, Cant, wait, ti...",Just woke up School today Cant wait til it over


In [23]:
# word = 'vip'
# print(df_dev.loc[df_dev['text'].str.contains(word),:]['sentiment'].value_counts())
# print(df_dev.loc[df_dev['text'].str.contains(word),:]['user'].value_counts())
# print(df_dev.loc[df_dev['text'].str.contains(word),:]['user'].value_counts().sum())

### RandomForestClassifier

In [218]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, classification_report

# Random forest baseline on the TF-IDF word features.
rfc = RandomForestClassifier(
    n_estimators = 150,
    random_state = 42,
    min_impurity_decrease = 0.0,
    n_jobs=-6
)

rfc.fit(X_train, y_train)

# Run inference on the hold-out set once and reuse the predictions for every
# metric, instead of calling predict() again for each one.
y_test_pred = rfc.predict(X_test)

f1 = f1_score(y_test, y_test_pred, average='macro')
report = classification_report(y_test, y_test_pred)

print(f1)
print(report)

0.3664453306305709
              precision    recall  f1-score   support

           0       0.00      0.00      0.00     28199
           1       0.58      1.00      0.73     38686

    accuracy                           0.58     66885
   macro avg       0.29      0.50      0.37     66885
weighted avg       0.33      0.58      0.42     66885



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### LinearSVC

In [137]:
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score, classification_report, confusion_matrix

# Linear SVM on the TF-IDF word features. The name `rfc` is kept because the
# other model cells reuse it for whichever classifier is current.
rfc = LinearSVC(random_state=70)

rfc.fit(X_train, y_train)

# Predict once; all three metrics below share the same predictions.
y_test_pred = rfc.predict(X_test)

f1 = f1_score(y_test, y_test_pred, average='macro')
report = classification_report(y_test, y_test_pred)
confusion = confusion_matrix(y_test, y_test_pred)

print(f1)
print(report)
print(confusion)

0.7679078743418951
              precision    recall  f1-score   support

           0       0.76      0.68      0.72     28199
           1       0.79      0.85      0.81     38686

    accuracy                           0.78     66885
   macro avg       0.77      0.76      0.77     66885
weighted avg       0.78      0.78      0.78     66885

[[19266  8933]
 [ 5962 32724]]


### GaussianNB

In [252]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score, classification_report, confusion_matrix

# Gaussian naive Bayes on the TF-IDF word features.
rfc = GaussianNB()

rfc.fit(X_train, y_train)

# Predict once; all three metrics below share the same predictions.
y_test_pred = rfc.predict(X_test)

f1 = f1_score(y_test, y_test_pred, average='macro')
report = classification_report(y_test, y_test_pred)
confusion = confusion_matrix(y_test, y_test_pred)

print(f1)
print(report)
print(confusion)

0.7023192652722203
              precision    recall  f1-score   support

           0       0.66      0.65      0.65     28199
           1       0.75      0.76      0.75     38686

    accuracy                           0.71     66885
   macro avg       0.70      0.70      0.70     66885
weighted avg       0.71      0.71      0.71     66885

[[18285  9914]
 [ 9464 29222]]


### LogisticRegression

In [253]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, classification_report, confusion_matrix

# Logistic regression (library defaults apart from n_jobs) on the TF-IDF
# word features.
rfc = LogisticRegression(n_jobs=-4)

rfc.fit(X_train, y_train)

# Predict once; all three metrics below share the same predictions.
y_test_pred = rfc.predict(X_test)

f1 = f1_score(y_test, y_test_pred, average='macro')
report = classification_report(y_test, y_test_pred)
confusion = confusion_matrix(y_test, y_test_pred)

print(f1)
print(report)
print(confusion)

0.7335232472069035
              precision    recall  f1-score   support

           0       0.73      0.63      0.68     28199
           1       0.75      0.83      0.79     38686

    accuracy                           0.75     66885
   macro avg       0.74      0.73      0.73     66885
weighted avg       0.75      0.75      0.74     66885

[[17625 10574]
 [ 6392 32294]]


In [None]:
from sklearn.model_selection import GridSearchCV, ParameterGrid

# Candidate hyper-parameter values for a logistic-regression grid search.
# NOTE(review): not every penalty/solver pairing is necessarily accepted by
# the estimator -- confirm before passing the full grid to GridSearchCV.
params = dict(
    penalty=['l1', 'l2', 'elasticnet', 'none'],
    tol=[1e-3, 1e-4, 1e-5],
    solver=['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
)

# gsCV = GridSearchCV()

### MLPClassifier

In [71]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import f1_score, classification_report

# Multi-layer perceptron on the TF-IDF word features. A fixed random_state is
# set so that weight initialisation and shuffling, and therefore the reported
# scores, are reproducible across runs.
rfc = MLPClassifier(verbose = True, random_state = 50)

rfc.fit(X_train, y_train)

# Predict once; both metrics below share the same predictions.
y_test_pred = rfc.predict(X_test)

f1 = f1_score(y_test, y_test_pred, average='macro')
report = classification_report(y_test, y_test_pred)

print(f1)
print(report)

Iteration 1, loss = 0.56999814
Iteration 2, loss = 0.54440468
Iteration 3, loss = 0.54240847
Iteration 4, loss = 0.54047893
Iteration 5, loss = 0.53869067
Iteration 6, loss = 0.53682975
Iteration 7, loss = 0.53520252
Iteration 8, loss = 0.53349645
Iteration 9, loss = 0.53177883
Iteration 10, loss = 0.53044132
Iteration 11, loss = 0.52913975
Iteration 12, loss = 0.52780389
Iteration 13, loss = 0.52658603
Iteration 14, loss = 0.52547633
Iteration 15, loss = 0.52441353
Iteration 16, loss = 0.52312409
Iteration 17, loss = 0.52223343
Iteration 18, loss = 0.52109321
Iteration 19, loss = 0.52009236
Iteration 20, loss = 0.51897507
Iteration 21, loss = 0.51800636
Iteration 22, loss = 0.51728088
Iteration 23, loss = 0.51633078
Iteration 24, loss = 0.51520809
Iteration 25, loss = 0.51445030
Iteration 26, loss = 0.51337090
Iteration 27, loss = 0.51238160
Iteration 28, loss = 0.51141686
Iteration 29, loss = 0.51072541
Iteration 30, loss = 0.50988250
Iteration 31, loss = 0.50899344
Iteration 32, los



0.6693705135515255
              precision    recall  f1-score   support

           0       0.67      0.52      0.59     28451
           1       0.70      0.81      0.75     39048

    accuracy                           0.69     67499
   macro avg       0.68      0.67      0.67     67499
weighted avg       0.69      0.69      0.68     67499



### SGDClassifier

In [254]:
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import f1_score, classification_report, confusion_matrix

# Linear model trained with stochastic gradient descent on the TF-IDF word
# features.
rfc = SGDClassifier(n_jobs=-5, random_state=50)

rfc.fit(X_train, y_train)

# Predict once; all three metrics below share the same predictions.
y_test_pred = rfc.predict(X_test)

f1 = f1_score(y_test, y_test_pred, average='macro')
report = classification_report(y_test, y_test_pred)
confusion = confusion_matrix(y_test, y_test_pred)

print(f1)
print(report)
print(confusion)

0.7041128433588708
              precision    recall  f1-score   support

           0       0.77      0.52      0.62     28199
           1       0.71      0.89      0.79     38686

    accuracy                           0.73     66885
   macro avg       0.74      0.70      0.70     66885
weighted avg       0.74      0.73      0.72     66885

[[14526 13673]
 [ 4388 34298]]


### SVC

In [255]:
from sklearn.svm import SVC
from sklearn.metrics import f1_score, classification_report, confusion_matrix

# Kernel SVM (library-default kernel) on the TF-IDF word features.
rfc = SVC(random_state=50)

rfc.fit(X_train, y_train)

# Predict once; all three metrics below share the same predictions, so
# inference is not repeated per metric.
y_test_pred = rfc.predict(X_test)

f1 = f1_score(y_test, y_test_pred, average='macro')
report = classification_report(y_test, y_test_pred)
confusion = confusion_matrix(y_test, y_test_pred)

print(f1)
print(report)
print(confusion)

## Submission

In [130]:
# Evaluation split, read from <cwd>/data/DSL2122_january_dataset/evaluation.csv.
eval_csv_path = Path.cwd().joinpath('data', 'DSL2122_january_dataset', 'evaluation.csv')
df_eval = pd.read_csv(eval_csv_path)

In [134]:
import string as py_string
from nltk.sentiment.util import mark_negation

# --- Text preprocessing: same steps, in the same order, as the development set ---

# Whitespace tokenisation.
df_eval['text_token'] = df_eval['text'].str.split()

# Drop tokens that begin with '@'.
df_eval['text_noAt'] = df_eval['text_token'].apply(
    lambda tokens: [tok for tok in tokens if not tok.startswith('@')]
)

# Drop tokens containing '&amp' or '&quot'.
df_eval['text_noAmpQuot'] = df_eval['text_noAt'].apply(
    lambda tokens: [tok for tok in tokens if '&amp' not in tok and '&quot' not in tok]
)

# Strip punctuation characters, then drop tokens that became empty.
_punct_table_eval = str.maketrans('', '', py_string.punctuation)
df_eval['text_noPunct'] = df_eval['text_noAmpQuot'].apply(
    lambda tokens: [
        cleaned
        for cleaned in (tok.translate(_punct_table_eval) for tok in tokens)
        if cleaned != ''
    ]
)

# Reuse the stemmer object created for the development set so both splits are
# always normalised by the same algorithm.
df_eval['text_stemmed'] = df_eval['text_noPunct'].apply(
    lambda tokens: [nltk_stemmer.stem(tok) for tok in tokens]
)

df_eval['text_neg'] = df_eval['text_stemmed'].apply(mark_negation)

# Alias (not a copy): 'text_final' is added to df_eval as well.
df_eval_final = df_eval
df_eval_final['text_final'] = df_eval['text_neg'].apply(lambda tokens: ' '.join(tokens))

# --- Feature extraction ---
# The evaluation text must be projected into the feature space the classifier
# was trained on. Fitting a new TfidfVectorizer here (and re-selecting the top
# N words from the evaluation data) would produce columns that stand for
# different words with different IDF weights than the training columns. So the
# vectorizer fitted on the development set and its `word_ind` column mask are
# reused, and only transform() is called.
wpm_eval = vectorizer.transform(df_eval_final['text_final'])

words_df_eval = pd.DataFrame(
    data = wpm_eval[:, word_ind].toarray(),
    columns = vectorizer.get_feature_names_out()[word_ind],
    index = df_eval.index
).add_prefix('word_')

# The submission cells read the evaluation matrix from `X`.
X = words_df_eval.values

assert X.shape[0] == len(df_eval), "one feature row per evaluation tweet expected"
assert X.shape[1] == X_train.shape[1], "evaluation features must match the training columns"


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\edo_c\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\edo_c\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [135]:
# Sanity check: row count of the evaluation frame vs. shape of its feature matrix.
for _shape in (df_eval.shape, X.shape):
    print(_shape)

(74999, 12)
(74999, 2000)


In [136]:
from sklearn.svm import LinearSVC

# Final model: linear SVM fitted on the training split, then applied to the
# evaluation feature matrix `X`.
rfc = LinearSVC(random_state=50)
rfc.fit(X_train, y_train)

y_pred = rfc.predict(X)


In [127]:
# Write the predictions as a two-column CSV: the row position as "Id" and the
# predicted label as "Predicted".
submission = pd.DataFrame({'Predicted': y_pred})
submission.to_csv("output.csv", index_label="Id", header=["Predicted"])