In [3]:
from zipfile import ZipFile
from pathlib import Path

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
import cupy 
import cudf

In [5]:
# Both CSV files sit side by side in the dataset folder under the current
# working directory.
dataset_dir = Path.cwd() / 'data' / 'DSL2122_january_dataset'
df_dev, df_eval = (
    pd.read_csv(dataset_dir / csv_name)
    for csv_name in ('development.csv', 'evaluation.csv')
)

In [6]:
# Sanity check: (rows, columns) of the evaluation frame.
df_eval.shape

(74999, 5)

In [7]:
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat is the supported replacement. Like append, it keeps each frame's
# original index labels (so the combined index contains duplicates) and fills
# columns missing from one frame with NaN.
df = pd.concat([df_dev, df_eval])

In [8]:
# df['date'] = df['date'].\
#     astype('string')\
#     .str.split(' ')\
#     .apply(lambda x : ' '.join([x[i] for i in [1,2,3,5]]))\
#     .pipe(pd.to_datetime)

### Stopwords analysis: sklearn, nltk, stop_words

#### nltk

In [9]:
from nltk import download as nltk_download

# Fetch the NLTK stopwords corpus; the corpus has to be present locally
# before `stopwords.words` is called below.
nltk_download('stopwords')

from nltk.corpus import stopwords

# English stopword list provided by NLTK.
nltk_stopwords = stopwords.words('english')

[nltk_data] Downloading package stopwords to /home/edoch/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#### sklearn

In [10]:
from sklearn.feature_extraction import text

# Materialise scikit-learn's built-in English stop words as a list.
sklearn_stopwords = [*text.ENGLISH_STOP_WORDS]

#### stop_words

In [11]:
# from stop_words import get_stop_words

# stop_words_stopwords = get_stop_words('english')

TODO: compare four stopword configurations: the union of all stopword lists, the sklearn list alone, the nltk list alone, and no stopword removal at all.

In [12]:
def stopwords_list_gen(source_list = None, generate_neg = True):
    """Merge several stopword collections into a single set.

    Parameters
    ----------
    source_list : iterable of iterables of str, optional
        Stopword collections to merge. When omitted, the notebook-level
        ``sklearn_stopwords`` and ``nltk_stopwords`` lists are used.
    generate_neg : bool, default True
        When true, every stopword is also added with a ``_neg`` suffix.

    Returns
    -------
    set of str
        Union of all stopwords, plus their ``_neg`` variants if requested.
    """
    # Resolve the default at call time: a list literal in the signature is a
    # shared mutable default and is bound to whatever the globals held when
    # the function was defined.
    if source_list is None:
        source_list = [sklearn_stopwords, nltk_stopwords]

    stopwords_all_list = set()
    for source in source_list:
        # Materialise each source once so that one-shot iterables (e.g.
        # generators) are not already exhausted when the suffixed variants
        # are built.
        words = set(source)
        stopwords_all_list |= words
        if generate_neg:
            stopwords_all_list.update(f'{word}_neg' for word in words)
    return stopwords_all_list

## Preprocessing

Tokenization

In [13]:
# Whitespace tokenisation: each text becomes a list of whitespace-separated tokens.
df['text_token'] = df['text'].str.split()

Removal of '@words', '&words', 'http:words'

In [14]:
# Drop every token that starts with '@', '&' or 'http:' in a single pass;
# str.startswith accepts a tuple of prefixes.
df['text_noAt'] = df['text_token'].apply(
    lambda tokens : [
        token for token in tokens
        if not token.startswith(('@', '&', 'http:'))
    ]
)

Removal of '&amp', '&quot'

In [15]:
# Discard tokens that contain '&amp' or '&quot' anywhere inside them.
df['text_noAmpQuot'] = df['text_noAt'].apply(
    lambda tokens : [
        token for token in tokens
        if '&amp' not in token and '&quot' not in token
    ]
)

Removal of repeated letters (incomplete)

In [16]:
a = 'ulaaa'

def remove_repeated_letters(word):
    """Placeholder for repeated-letter removal.

    Currently it only prints each character of ``word`` on its own line;
    nothing is removed and nothing is returned.
    """
    # TODO: implement the actual collapsing of repeated letters.
    for character in word:
        print(character)

# nltk_stemmer.stem(a)

Remove punctuation

In [17]:
import string as py_string

# Build the punctuation-stripping table once; previously it was rebuilt for
# every single token.
punctuation_table = str.maketrans('', '', py_string.punctuation)

# Strip punctuation characters from every token, then drop the tokens that
# end up empty (i.e. tokens that consisted of punctuation only).
df['text_noPunct'] = df['text_noAmpQuot'].apply(
    lambda tokens : [
        stripped
        for stripped in (token.translate(punctuation_table) for token in tokens)
        if stripped != ''
    ]
)

Negation

In [18]:
from nltk.sentiment.util import mark_negation

# mark_negation takes a token list directly, so it is handed to apply as-is.
df['text_neg'] = df['text_noPunct'].apply(mark_negation)

Removal of stopwords

In [19]:
# considered_stopwords = stop_words_stopwords + [f'{word}_neg' for word in stop_words_stopwords]

# df['text_noStopwords'] = df['text_neg'].apply(lambda x : [i for i in x if i not in considered_stopwords])

Stemming and lemmatization

In [20]:
# from nltk.stem import PorterStemmer, LancasterStemmer, SnowballStemmer, WordNetLemmatizer

# nltk_download('wordnet')
# nltk_download('omw-1.4')

# nltk_stemmer = PorterStemmer()
# # nltk_stemmer = LancasterStemmer()
# # nltk_stemmer = SnowballStemmer('english')
# nltk_lemmatizer = WordNetLemmatizer()
# # from nltk.corpus import wordnet

# df['text_stemmed'] = df['text_noPunct'].apply(lambda x : [nltk_stemmer.stem(word) for word in x])
# df['text_stemmed'] = df['text_noPunct'].apply(lambda x : [nltk_lemmatizer.lemmatize(word) for word in x])

User manual filter

In [21]:
# Keep a row when its user is not one of the manually excluded accounts
# (four exact names, plus any name containing 'tweeteradder' or
# 'tweetfollow'), or when the row has no sentiment label.
is_kept_user = (
    ~df['user'].isin(['lost_dog', 'webwoke', 'tweetpet', 'divxdownloads'])
    & ~df['user'].str.contains('tweeteradder')
    & ~df['user'].str.contains('tweetfollow')
)
df = df.loc[is_kept_user | df['sentiment'].isna()]

In [22]:
# Shape of the rows without a sentiment label, to check how many unlabelled
# rows remain after the user filter.
df[df['sentiment'].isna()].shape

(74999, 11)

## Model creation

In [40]:
from sklearn.model_selection import ParameterGrid

# Candidate values for the LinearSVC constructor arguments.
linearSVC_params = dict(
    penalty=['l1'],  # ['l1','l2']
    dual=[False],
    tol=[1e-3, 1e-4],
    fit_intercept=[False],
    class_weight=['balanced'],
    max_iter=[100, 300],
    random_state=[42],
    C=[1, 10, 50],
)

# Candidate values for the TfidfVectorizer constructor arguments.
TfidfVectorizer_params = dict(
    stop_words=[None],
    ngram_range=[(1, 1)],
    max_features=[None, 15000, 30000],
    max_df=[1.0, 0.1, 0.005],
    min_df=[1, 0.0001, 0.00001],
    binary=[True, False],
    norm=['l1', 'l2'],
    use_idf=[True],
    smooth_idf=[True, False],
    sublinear_tf=[False],
)

In [38]:
# Total number of (vectorizer, model) configurations across the two grids.
n_combinations = len(ParameterGrid(TfidfVectorizer_params)) * len(ParameterGrid(linearSVC_params))
print(n_combinations)

# The same count scaled by 13 and divided down from seconds to days
# (presumably ~13 s per configuration - TODO confirm).
print(n_combinations * 13 / 60 / 60 / 24)

12
0.0018055555555555557


# Train test split

In [41]:
from sklearn.model_selection import ParameterGrid
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score, classification_report, confusion_matrix

import csv
from datetime import datetime
from pathlib import Path

logs_path = Path.cwd()/'logs'
results_path = Path.cwd()/'results'
logs_path.mkdir(exist_ok=True)
results_path.mkdir(exist_ok=True)

# One timestamp shared by both files, so a log can be paired with its results
# file; formatted without spaces or colons so the names are valid on every OS.
run_timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S_%f')
log_file_name = logs_path/f'log_out_{run_timestamp}.txt'
results_file_name = results_path/f'results_out_{run_timestamp}.txt'
with open(log_file_name, 'w') as file_log:
    file_log.write('File output\n\n')
# The parameter dicts contain commas, so rows go through csv.writer, which
# quotes them; plain string concatenation produced a malformed CSV.
with open(results_file_name, 'w', newline='') as file_result:
    csv.writer(file_result, lineterminator='\n').writerow(
        ['configuration_number', 'vectorizer_param', 'model_param', 'f1']
    )

# Build the model input on a new frame instead of writing a column into `df`
# (which is itself a filtered selection). Swap 'text_neg' for 'text_noPunct'
# to train without negation marking.
df_final = df.assign(text_final=df['text_neg'].apply(lambda x : ' '.join(x)))

# Rows with a sentiment label are used for training/validation; rows without
# one form the test set.
mask_train_test = df_final['sentiment'].notna()

X_train_valid = df_final.loc[mask_train_test, 'text_final'].values
y_train_valid = df_final.loc[mask_train_test, 'sentiment'].values
X_test = df_final.loc[~mask_train_test, 'text_final'].values

X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_valid, 
    y_train_valid, 
    shuffle=True, 
    # stratify=y_train_valid, 
    train_size=0.8, 
    random_state=50
)

vectorizer_grid = ParameterGrid(TfidfVectorizer_params)
model_grid = ParameterGrid(linearSVC_params)

configuration_number = 0
print(f'Number of combinations:\t{len(vectorizer_grid)*len(model_grid)}')
for vectorizer_param in vectorizer_grid:
    for model_param in model_grid:
        configuration_number += 1
        print(configuration_number, vectorizer_param, model_param)

        pipe = Pipeline([
            ('tfidf', TfidfVectorizer(**vectorizer_param)),
            ('linearSVC', LinearSVC(**model_param))
        ])

        pipe.fit(X_train, y_train)

        # Predict once and reuse the result for every metric.
        y_valid_pred = pipe.predict(X_valid)
        f1 = f1_score(y_valid, y_valid_pred, average='macro')
        report = classification_report(y_valid, y_valid_pred)
        # confusion = confusion_matrix(y_valid, y_valid_pred)

        # Append after every configuration so partial results survive an
        # interrupted run.
        with open(log_file_name, 'a') as file_log:
            file_log.write(f'configuration_number: {configuration_number}\n')
            file_log.write(f'vectorizer_param: \t{vectorizer_param}\n')
            file_log.write(f'model_param: \t\t{model_param}\n')
            file_log.write(f'f1_score:\t\t\t{f1}\n\n')
            file_log.write(f'{report}\n\n')
            file_log.write(f'{"*"*150}\n\n')
        with open(results_file_name, 'a', newline='') as file_result:
            csv.writer(file_result, lineterminator='\n').writerow(
                [configuration_number, vectorizer_param, model_param, f1]
            )

Number of combinations:	12
1 {'max_features': None, 'ngram_range': (1, 1), 'stop_words': None} {'dual': False, 'penalty': 'l1'}
2 {'max_features': 10000, 'ngram_range': (1, 1), 'stop_words': None} {'dual': False, 'penalty': 'l1'}
3 {'max_features': 11000, 'ngram_range': (1, 1), 'stop_words': None} {'dual': False, 'penalty': 'l1'}
4 {'max_features': 12000, 'ngram_range': (1, 1), 'stop_words': None} {'dual': False, 'penalty': 'l1'}
5 {'max_features': 13000, 'ngram_range': (1, 1), 'stop_words': None} {'dual': False, 'penalty': 'l1'}
6 {'max_features': 14000, 'ngram_range': (1, 1), 'stop_words': None} {'dual': False, 'penalty': 'l1'}
7 {'max_features': 15000, 'ngram_range': (1, 1), 'stop_words': None} {'dual': False, 'penalty': 'l1'}
8 {'max_features': 16000, 'ngram_range': (1, 1), 'stop_words': None} {'dual': False, 'penalty': 'l1'}
9 {'max_features': 17000, 'ngram_range': (1, 1), 'stop_words': None} {'dual': False, 'penalty': 'l1'}
10 {'max_features': 18000, 'ngram_range': (1, 1), 'stop_

# Grid search

In [54]:
from sklearn.model_selection import GridSearchCV
# train_test_split is used below; import it here so this cell does not depend
# on another cell having been run first.
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score, classification_report, confusion_matrix

# Build the model input on a new frame instead of writing a column into `df`
# (which is itself a filtered selection). Swap 'text_neg' for 'text_noPunct'
# to train without negation marking.
df_final = df.assign(text_final=df['text_neg'].apply(lambda x : ' '.join(x)))

# Rows with a sentiment label are used for training/validation; rows without
# one form the test set.
mask_train_test = df_final['sentiment'].notna()

X_train_valid = df_final.loc[mask_train_test, 'text_final'].values
y_train_valid = df_final.loc[mask_train_test, 'sentiment'].values
X_test = df_final.loc[~mask_train_test, 'text_final'].values

X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_valid, 
    y_train_valid, 
    shuffle=True, 
    # stratify=y_train_valid, 
    train_size=0.8, 
    random_state=50
)

# Pipeline step names; they also prefix the grid keys below.
vectorizer = 'tfidf'
model = 'linearSVC'

pipe = Pipeline([
    (vectorizer, TfidfVectorizer()),
    (model, LinearSVC())
])

def params_for_GridSearchCV(params_IN,step_name):
    """Prefix every key of ``params_IN`` with ``'<step_name>__'``.

    Returns a new dict mapping ``f'{step_name}__{key}'`` to the original
    value, leaving ``params_IN`` untouched.
    """
    return {f'{step_name}__{key}':value for (key,value) in params_IN.items()}

params_all = params_for_GridSearchCV(TfidfVectorizer_params,vectorizer)
params_all.update(params_for_GridSearchCV(linearSVC_params,model))
print(params_all)

gscv = GridSearchCV(pipe, params_all, cv = 3, verbose = 4, scoring='f1_macro')

gscv.fit(X_train, y_train)

# Predict once on the held-out validation split and reuse the result for
# every metric.
y_valid_pred = gscv.predict(X_valid)
f1 = f1_score(y_valid, y_valid_pred, average='macro')
report = classification_report(y_valid, y_valid_pred)
confusion = confusion_matrix(y_valid, y_valid_pred)

print(f1)
print(report)
print(confusion)
# configuration_number = 0
# print(f'Number of combinations:\t{len(list(ParameterGrid(TfidfVectorizer_params)))*len(list(ParameterGrid(linearSVC_params)))}')
# for vectorizer_param in ParameterGrid(TfidfVectorizer_params):
#     for model_param in ParameterGrid(linearSVC_params):
#         configuration_number += 1
#         print(configuration_number, vectorizer_param, model_param)

#         pipe = Pipeline([
#             ('tfidf', TfidfVectorizer(**vectorizer_param)),
#             ('linearSVC', LinearSVC(**model_param))
#         ])

#         pipe.fit(X_train, y_train)

#         f1 = f1_score(y_valid, pipe.predict(X_valid),average='macro')
#         report = classification_report(y_valid, pipe.predict(X_valid))
#         # confusion = confusion_matrix(y_valid, pipe.predict(X_valid))

#         with open(log_file_name, 'a') as file_log:
#             file_log.write(f'configuration_number: {configuration_number}\n')
#             file_log.write(f'vectorizer_param: \t{vectorizer_param}\n')
#             file_log.write(f'model_param: \t\t{model_param}\n')
#             file_log.write(f'f1_score:\t\t\t{f1}\n\n')
#             file_log.write(f'{report}\n\n')
#             file_log.write(f'{"*"*150}\n\n')
#         with open(results_file_name, 'a') as file_result:
#             file_result.write(f'{configuration_number},{vectorizer_param},{model_param},{f1}\n')

{'tfidf__stop_words': [None], 'tfidf__ngram_range': [(1, 1)], 'tfidf__max_features': [None, 10000, 11000, 12000, 13000, 14000, 15000, 16000, 17000, 18000, 19000, 20000], 'linearSVC__penalty': ['l1'], 'linearSVC__dual': [False]}
Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV 1/3] END linearSVC__dual=False, linearSVC__penalty=l1, tfidf__max_features=None, tfidf__ngram_range=(1, 1), tfidf__stop_words=None;, score=0.774 total time=   3.5s
[CV 2/3] END linearSVC__dual=False, linearSVC__penalty=l1, tfidf__max_features=None, tfidf__ngram_range=(1, 1), tfidf__stop_words=None;, score=0.775 total time=   3.5s
[CV 3/3] END linearSVC__dual=False, linearSVC__penalty=l1, tfidf__max_features=None, tfidf__ngram_range=(1, 1), tfidf__stop_words=None;, score=0.774 total time=   3.6s
[CV 1/3] END linearSVC__dual=False, linearSVC__penalty=l1, tfidf__max_features=10000, tfidf__ngram_range=(1, 1), tfidf__stop_words=None;, score=0.773 total time=   3.2s
[CV 2/3] END linearSVC__dual=False, li

In [57]:
# Raw cross-validation results of the grid search (a dict of arrays: fit/score
# timings, parameter values, ...).
gscv.cv_results_

{'mean_fit_time': array([3.0788544 , 2.61653725, 2.67823307, 2.76365209, 2.62333131,
        2.55601279, 2.55158011, 2.60176015, 2.61510547, 2.60183247,
        2.48423521, 2.59180554]),
 'std_fit_time': array([0.05695392, 0.07970918, 0.05352276, 0.04373008, 0.05687926,
        0.0360079 , 0.0691899 , 0.02841061, 0.01559789, 0.01321297,
        0.01071902, 0.07057189]),
 'mean_score_time': array([0.49050371, 0.49622178, 0.4972016 , 0.50624243, 0.49720343,
        0.47108515, 0.46942592, 0.47684622, 0.47558371, 0.47171442,
        0.47391669, 0.47064034]),
 'std_score_time': array([0.00104816, 0.00214462, 0.00671898, 0.00108378, 0.00478194,
        0.00092891, 0.00280431, 0.00194455, 0.00256603, 0.00093681,
        0.00098326, 0.00383064]),
 'param_linearSVC__dual': masked_array(data=[False, False, False, False, False, False, False, False,
                    False, False, False, False],
              mask=[False, False, False, False, False, False, False, False,
                    Fals

# cuML

In [27]:
# from sklearn.model_selection import ParameterGrid
# from sklearn.pipeline import Pipeline
# from sklearn.model_selection import train_test_split
# from sklearn.feature_extraction.text import TfidfVectorizer
# from cuml.feature_extraction.text import TfidfVectorizer as cuml_TfidfVectorizer
# from sklearn.svm import LinearSVC
# from cuml.svm import LinearSVC as cuml_LinearSVC
# from sklearn.metrics import f1_score, classification_report, confusion_matrix

# from datetime import datetime
# from pathlib import Path

# logs_path = Path.cwd()/'logs'
# results_path = Path.cwd()/'results'
# logs_path.mkdir(exist_ok=True)
# results_path.mkdir(exist_ok=True)
# log_file_name = logs_path/f'log_out_{datetime.now()}.txt'
# results_file_name = results_path/f'results_out_{datetime.now()}.txt'
# with open(log_file_name, 'w') as file_log:
#     file_log.write('File output\n\n')
# with open(results_file_name, 'w') as file_result:
#     file_result.write('configuration_number,vectorizer_param,model_param,f1\n')

# df_final = df
# df_final['text_final'] = df_final['text_noPunct'].apply(lambda x : ' '.join(x))

# mask_train_test = df_final['sentiment'].notna()

# X_train_valid = df_final.loc[mask_train_test,:]['text_final'].values
# y_train_valid = df_final.loc[mask_train_test,:]['sentiment'].values
# X_test = df_final.loc[~mask_train_test,:]['text_final'].values

# X_train, X_valid, y_train, y_valid = train_test_split(
#     X_train_valid, 
#     y_train_valid, 
#     shuffle=True, 
#     # stratify=y_train_valid, 
#     train_size=0.8, 
#     random_state=50
# )

# configuration_number = 0
# print(f'Number of combinations:\t{len(list(ParameterGrid(TfidfVectorizer_params)))*len(list(ParameterGrid(linearSVC_params)))}')
# for vectorizer_param in ParameterGrid(TfidfVectorizer_params):
#     for model_param in ParameterGrid(linearSVC_params):
#         configuration_number += 1
#         print(configuration_number, vectorizer_param, model_param)

#         # pipe = Pipeline([
#         #     ('tfidf', TfidfVectorizer(**vectorizer_param)),
#         #     ('linearSVC', LinearSVC(**model_param))
#         # ])

#         tfidf = TfidfVectorizer(**vectorizer_param)
#         linearSVC = LinearSVC(**model_param)


#         cuml_X_train = cudf.DataFrame(cudf.Series(X_train, name='text'))
#         cuml_X_valid = cudf.DataFrame(cudf.Series(X_valid, name='text'))
#         cuml_y_train = cupy.asarray(y_train)

#         X_train = tfidf.fit_transform(X_train)
#         X_valid = tfidf.transform(X_valid)

#         linearSVC.fit(X_train,y_train)

#         y_pred = linearSVC.predict(X_valid)

#         y_pred = cupy.asnumpy(y_pred)

#         f1 = f1_score(y_valid, y_pred,average='macro')
#         report = classification_report(y_valid, y_pred)
#         # confusion = confusion_matrix(y_valid, y_pred)

#         with open(log_file_name, 'a') as file_log:
#             file_log.write(f'configuration_number: {configuration_number}\n')
#             file_log.write(f'vectorizer_param: \t{vectorizer_param}\n')
#             file_log.write(f'model_param: \t\t{model_param}\n')
#             file_log.write(f'f1_score:\t\t\t{f1}\n\n')
#             file_log.write(f'{report}\n\n')
#             file_log.write(f'{"*"*150}\n\n')
#         with open(results_file_name, 'a') as file_result:
#             file_result.write(f'{configuration_number},{vectorizer_param},{model_param},{f1}\n')
