In [15]:
from zipfile import ZipFile
from pathlib import Path

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

In [16]:
# Both CSV files sit in the same dataset folder below the current working directory.
dataset_dir = Path.cwd()/'data'/'DSL2122_january_dataset'

df_dev = pd.read_csv(dataset_dir/'development.csv')
df_eval = pd.read_csv(dataset_dir/'evaluation.csv')

In [17]:
# (rows, columns) of the frame read from evaluation.csv
df_eval.shape

(74999, 5)

In [18]:
# Stack the development rows and the evaluation rows into a single frame so that
# every preprocessing step below is applied to both at once.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is the
# supported equivalent. As with append, the original row labels are kept, so labels
# repeat between the two parts.
df = pd.concat([df_dev, df_eval])

In [19]:
# df['date'] = df['date'].\
#     astype('string')\
#     .str.split(' ')\
#     .apply(lambda x : ' '.join([x[i] for i in [1,2,3,5]]))\
#     .pipe(pd.to_datetime)

### Stopwords analysis: sklearn, nltk, stop_words

#### nltk

In [20]:
from nltk import download as nltk_download
from nltk.corpus import stopwords

# Make sure the stopwords corpus is available locally, then read the English list.
nltk_download('stopwords')
nltk_stopwords = stopwords.words('english')

[nltk_data] Downloading package stopwords to /home/edoch/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#### sklearn

In [21]:
from sklearn.feature_extraction import text

# ENGLISH_STOP_WORDS is a frozenset, so list(...) would yield an arbitrary order that
# can change between kernel sessions; sorting makes the list reproducible.
sklearn_stopwords = sorted(text.ENGLISH_STOP_WORDS)

#### stop_words

In [22]:
# from stop_words import get_stop_words

# stop_words_stopwords = get_stop_words('english')

TODO: compare four stopword configurations: the union of all stopword lists, the sklearn list alone, the nltk list alone, and no stopword removal at all.

In [23]:
def stopwords_list_gen(source_list = None, generate_neg = True):
    """Merge several stopword collections into a single set.

    Parameters
    ----------
    source_list : iterable of iterables of str, optional
        The stopword collections to merge. When omitted, the notebook-level
        ``sklearn_stopwords`` and ``nltk_stopwords`` lists are used; they are
        looked up at call time, so re-running the cells that define them is
        picked up without redefining this function.
    generate_neg : bool, default True
        When true, also add a ``<word>_neg`` variant for every stopword.

    Returns
    -------
    set of str
        The union of all sources (plus the ``_neg`` variants when requested).

    Notes
    -----
    NOTE(review): nltk's ``mark_negation`` tags tokens with an upper-case
    ``_NEG`` suffix, so the lower-case variants produced here only match once
    the text has been lower-cased - confirm this is the intended use.
    """
    if source_list is None:
        source_list = [sklearn_stopwords, nltk_stopwords]

    stopwords_all = set()
    for source in source_list:
        # Grow the same set in place instead of building a new one per union.
        stopwords_all.update(source)
        if generate_neg:
            stopwords_all.update(f'{word}_neg' for word in source)
    return stopwords_all

## Preprocessing

Tokenization

In [24]:
# Whitespace tokenization: str.split() with no separator turns each text into a list of tokens.
df['text_token'] = df['text'].str.split()

Removal of '@words'

In [25]:
# Keep only the tokens that do not begin with '@'.
df['text_noAt'] = df['text_token'].apply(
    lambda tokens : [token for token in tokens if not token.startswith('@')]
)

Removal of tokens containing the HTML entities '&amp' and '&quot'

In [26]:
# Drop, in a single pass, every token that contains '&amp' or '&quot'.
df['text_noAmpQuot'] = df['text_noAt'].apply(
    lambda tokens : [
        token
        for token in tokens
        if '&amp' not in token and '&quot' not in token
    ]
)

Removal of repeated letters (incomplete)

In [27]:
a = 'ulaaa'

def remove_repeated_letters(word, max_repeat=2):
    """Shorten runs of the same character in ``word``.

    Any run of identical consecutive characters longer than ``max_repeat`` is
    cut down to exactly ``max_repeat`` characters; shorter runs are untouched.
    The default of 2 keeps ordinary double letters ('good') while shortening
    elongated spellings ('goooood' -> 'good').

    Parameters
    ----------
    word : str
        The token to normalize.
    max_repeat : int, default 2
        Maximum number of identical consecutive characters to keep.

    Returns
    -------
    str
        The normalized token.

    Raises
    ------
    ValueError
        If ``max_repeat`` is smaller than 1.
    """
    if max_repeat < 1:
        raise ValueError('max_repeat must be at least 1')

    kept = []
    previous = None
    run_length = 0
    for letter in word:
        # Track how long the current run of identical characters is.
        run_length = run_length + 1 if letter == previous else 1
        previous = letter
        if run_length <= max_repeat:
            kept.append(letter)
    return ''.join(kept)

# nltk_stemmer.stem(a)

Remove punctuation

In [28]:
import string as py_string

# Deletion table for every character in string.punctuation, built once up front
# instead of once per token.
punctuation_table = str.maketrans('', '', py_string.punctuation)

# Strip punctuation from each token, then drop the tokens that end up empty.
df['text_noPunct'] = df['text_noAmpQuot'].apply(
    lambda tokens : [
        stripped
        for stripped in (token.translate(punctuation_table) for token in tokens)
        if stripped != ''
    ]
)

Negation

In [29]:
from nltk.sentiment.util import mark_negation

# Run nltk's negation marker over every token list.
# NOTE(review): punctuation was already removed in the previous step, so a negation
# scope presumably runs to the end of each text - confirm this is intended.
df['text_neg'] = df['text_noPunct'].apply(mark_negation)

Removal of stopwords

In [30]:
# considered_stopwords = stop_words_stopwords + [f'{word}_neg' for word in stop_words_stopwords]

# df['text_noStopwords'] = df['text_neg'].apply(lambda x : [i for i in x if i not in considered_stopwords])

Stemming and lemmatization

In [31]:
# from nltk.stem import PorterStemmer, LancasterStemmer, SnowballStemmer, WordNetLemmatizer

# nltk_download('wordnet')
# nltk_download('omw-1.4')

# nltk_stemmer = PorterStemmer()
# # nltk_stemmer = LancasterStemmer()
# # nltk_stemmer = SnowballStemmer('english')
# nltk_lemmatizer = WordNetLemmatizer()
# # from nltk.corpus import wordnet

# df['text_stemmed'] = df['text_noPunct'].apply(lambda x : [nltk_stemmer.stem(word) for word in x])
# df['text_stemmed'] = df['text_noPunct'].apply(lambda x : [nltk_lemmatizer.lemmatize(word) for word in x])

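Manual user filter: drop labelled rows written by a hand-picked list of accounts (rows without a sentiment label are always kept)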

In [32]:
# Accounts excluded by exact user name.
excluded_users = ['lost_dog', 'webwoke', 'tweetpet', 'divxdownloads']

is_excluded_user = df['user'].isin(excluded_users)
is_tweeteradder = df['user'].str.contains('tweeteradder')
is_tweetfollow = df['user'].str.contains('tweetfollow')

# Rows without a sentiment value are kept regardless of their user.
has_no_sentiment = df['sentiment'].isna()

rows_to_keep = (~is_excluded_user & ~is_tweeteradder & ~is_tweetfollow) | has_no_sentiment
df = df.loc[rows_to_keep]

In [33]:
# Shape of the rows that have no sentiment value.
df.loc[df['sentiment'].isna()].shape

(74999, 11)

## Model creation

In [85]:
from sklearn.model_selection import ParameterGrid

# Candidate values for each LinearSVC argument; ParameterGrid expands the dict
# into every combination (one value per key).
# NOTE(review): 'dual' is presumably pinned to False so that penalty='l1' is a
# valid combination - confirm against the LinearSVC documentation.
linearSVC_params = {
    'penalty' : ['l1','l2'],
    'dual' : [False],
    'tol' : [1e-3,1e-4,1e-5],
    'fit_intercept' : [True, False],
    'class_weight' : ['balanced', None],
    # 'verbose' : [1],
    'max_iter' : [100,300,500],
    'random_state' : [42],
    'C' : [1,10,50,100]
}

# Candidate values for each TfidfVectorizer argument, expanded the same way.
# NOTE(review): 'smooth_idf' should have no effect when 'use_idf' is False, so
# those combinations look redundant - confirm before running the full grid.
TfidfVectorizer_params = {
    'stop_words' : ['english', None],
    'ngram_range' : [(1,1),(1,2)],
    'max_features' : [300,1000,2000],
    'max_df' : [1.0,0.1,0.005],
    'min_df' : [1,0.0001,0.00001],
    'binary' : [True,False],
    'norm' : ['l1','l2'],
    'use_idf' : [True,False],
    'smooth_idf' : [True,False],
    'sublinear_tf' : [True,False]
}



In [86]:
# Size of the full search space: every vectorizer configuration paired with every
# model configuration.
n_vectorizer_configs = len(ParameterGrid(TfidfVectorizer_params))
n_model_configs = len(ParameterGrid(linearSVC_params))
n_total_configs = n_vectorizer_configs * n_model_configs

# Divide by 13, then by 60, 60 and 24 (seconds -> days).
# NOTE(review): 13 is presumably an assumed number of configurations evaluated
# per second - confirm.
print(n_total_configs / 13 / 60 / 60 / 24)

0.8861538461538462


In [60]:
# from datetime import datetime
# from pathlib import Path

# logs_path = Path.cwd()/'logs'
# results_path = Path.cwd()/'results'
# logs_path.mkdir(exist_ok=True)
# results_path.mkdir(exist_ok=True)
# log_file_name = logs_path/f'log_out_{datetime.now()}.txt'
# results_file_name = results_path/f'results_out_{datetime.now()}.txt'
# with open(log_file_name, 'w') as file_log:
#     file_log.write('File output\n\n')
# with open(results_file_name, 'w') as file_result:
#     file_result.write('')

# df_final = df
# df_final['text_final'] = df_final['text_noPunct'].apply(lambda x : ' '.join(x))

# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.svm import LinearSVC
# from sklearn.metrics import f1_score, classification_report, confusion_matrix

# configuration_number = 0
# for param in ParameterGrid(TfidfVectorizer_params):
#     configuration_number += 1
#     vectorizer = TfidfVectorizer(**param)

#     wpm = vectorizer.fit_transform(df_final['text_final'])

#     word_freq = pd.Series(
#         data = np.asarray(wpm.sum(axis=0)).squeeze(),
#         index = vectorizer.get_feature_names_out()
#     ).sort_values(ascending=False)

#     word_ind = [w in word_freq.index for w in vectorizer.get_feature_names_out()]

#     words_df = pd.DataFrame(
#         data = wpm[:,word_ind].toarray(),
#         columns = vectorizer.get_feature_names_out()[word_ind],
#         index = df_final.index
#     ).add_prefix('word_')

#     from sklearn.model_selection import train_test_split

#     mask_train_test = df_final['sentiment'].notna()

#     X_train_valid = words_df.loc[mask_train_test,:].values
#     y_train_valid = df_final.loc[mask_train_test,:]['sentiment'].values
#     X_test = words_df.loc[~mask_train_test,:].values

#     X_train, X_valid, y_train, y_valid = train_test_split(
#         X_train_valid, 
#         y_train_valid, 
#         shuffle=True, 
#         # stratify=y_train_valid, 
#         train_size=0.8, 
#         random_state=50
#     )

#     rfc = LinearSVC(C=10, class_weight='balanced', dual=False, max_iter=100, random_state=42, tol=0.001)
#     rfc.fit(X_train, y_train)
    
#     f1 = f1_score(y_valid, rfc.predict(X_valid),average='macro')
#     report = classification_report(y_valid, rfc.predict(X_valid))
#     confusion = confusion_matrix(y_valid, rfc.predict(X_valid))

#     print(f1)
#     print(report)
#     print(confusion)

#     with open(log_file_name, 'a') as file_log:
#         file_log.write(f'{configuration_number}\n')
#         file_log.write(f'{param}\n')
#         file_log.write(f'\tf1_score:\t{f1}\n\n')
#         file_log.write(f'\t{report}\n\n')
#         file_log.write(f'{"*"*150}\n\n')
    
#     with open(results_file_name, 'a') as file_result:
#         file_result.write(f'{configuration_number},{param},{f1}\n')


0.695929687226944
              precision    recall  f1-score   support

         0.0       0.65      0.64      0.65     18783
         1.0       0.74      0.75      0.74     25807

    accuracy                           0.70     44590
   macro avg       0.70      0.70      0.70     44590
weighted avg       0.70      0.70      0.70     44590

[[12111  6672]
 [ 6537 19270]]
0.7167469965669069
              precision    recall  f1-score   support

         0.0       0.66      0.71      0.68     18783
         1.0       0.78      0.73      0.75     25807

    accuracy                           0.72     44590
   macro avg       0.72      0.72      0.72     44590
weighted avg       0.73      0.72      0.72     44590

[[13386  5397]
 [ 7050 18757]]


In [84]:
import csv
from datetime import datetime
from pathlib import Path

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

# Exhaustive search over every (vectorizer config, model config) pair.
# A human-readable entry per configuration goes to the log file; one CSV row per
# configuration (configuration_number, vectorizer_param, model_param, f1) goes to
# the results file.

# A single timestamp shared by both files so the log and the results of one run
# carry the same suffix.
run_timestamp = datetime.now()

logs_path = Path.cwd()/'logs'
results_path = Path.cwd()/'results'
logs_path.mkdir(exist_ok=True)
results_path.mkdir(exist_ok=True)
log_file_name = logs_path/f'log_out_{run_timestamp}.txt'
results_file_name = results_path/f'results_out_{run_timestamp}.txt'
with open(log_file_name, 'w') as file_log:
    file_log.write('File output\n\n')
# The parameter dicts contain commas, so rows are written with csv.writer, which
# quotes such fields and keeps the file parseable as four columns.
with open(results_file_name, 'w', newline='') as file_result:
    csv.writer(file_result, lineterminator='\n').writerow(
        ['configuration_number', 'vectorizer_param', 'model_param', 'f1']
    )

# df_final is the same object as df, so the new column is added to df as well.
df_final = df
df_final['text_final'] = df_final['text_noPunct'].apply(' '.join)

# Rows with a sentiment value are used for training/validation, the rest form
# the test set. Neither depends on the configuration, so both are computed once.
mask_train_test = df_final['sentiment'].notna()
y_train_valid = df_final.loc[mask_train_test, 'sentiment'].values

vectorizer_grid = ParameterGrid(TfidfVectorizer_params)
model_grid = ParameterGrid(linearSVC_params)

configuration_number = 0
print(f'Number of combinations:\t{len(vectorizer_grid)*len(model_grid)}')
for vectorizer_param in vectorizer_grid:
    # Everything in this outer body depends only on the vectorizer configuration,
    # so it runs once per vectorizer config rather than once per model config.
    vectorizer = TfidfVectorizer(**vectorizer_param)
    wpm = vectorizer.fit_transform(df_final['text_final'])
    feature_names = vectorizer.get_feature_names_out()

    # Total weight of each feature over all documents, largest first.
    word_freq = pd.Series(
        data = np.asarray(wpm.sum(axis=0)).squeeze(),
        index = feature_names
    ).sort_values(ascending=False)

    # word_freq is indexed by the very same feature names, so this mask currently
    # selects every column; it is kept as the hook for a future feature filter.
    word_ind = [w in word_freq.index for w in feature_names]

    words_df = pd.DataFrame(
        data = wpm[:,word_ind].toarray(),
        columns = feature_names[word_ind],
        index = df_final.index
    ).add_prefix('word_')

    X_train_valid = words_df.loc[mask_train_test,:].values
    X_test = words_df.loc[~mask_train_test,:].values

    X_train, X_valid, y_train, y_valid = train_test_split(
        X_train_valid,
        y_train_valid,
        shuffle=True,
        # stratify=y_train_valid,
        train_size=0.8,
        random_state=50
    )

    for model_param in model_grid:
        configuration_number += 1
        print(configuration_number, vectorizer_param, model_param)

        rfc = LinearSVC(**model_param)
        try:
            rfc.fit(X_train, y_train)

            # Predict once and reuse the predictions for every metric.
            y_valid_pred = rfc.predict(X_valid)
            f1 = f1_score(y_valid, y_valid_pred, average='macro')
            report = classification_report(y_valid, y_valid_pred)
            confusion = confusion_matrix(y_valid, y_valid_pred)

            log_outcome = f'f1_score:\t\t\t{f1}\n\n{report}\n\n'
            f1_field = f1
        except ValueError as exception:
            # A parameter combination the estimator rejects: record the message
            # and keep searching instead of aborting the run.
            log_outcome = f'{exception}\n\n'
            f1_field = 'NaN'

        with open(log_file_name, 'a') as file_log:
            file_log.write(f'configuration_number: {configuration_number}\n')
            file_log.write(f'vectorizer_param: \t{vectorizer_param}\n')
            file_log.write(f'model_param: \t\t{model_param}\n')
            file_log.write(log_outcome)
            file_log.write(f'{"*"*150}\n\n')
        with open(results_file_name, 'a', newline='') as file_result:
            csv.writer(file_result, lineterminator='\n').writerow(
                [configuration_number, vectorizer_param, model_param, f1_field]
            )

Number of combinations:	2
1 {'max_features': 300, 'smooth_idf': True, 'use_idf': False} {'dual': True, 'penalty': 'l1'}
2 {'max_features': 300, 'smooth_idf': True, 'use_idf': False} {'dual': True, 'penalty': 'l2'}
