# **Imports**

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
import gensim
from nltk.stem import WordNetLemmatizer

from gensim.parsing.preprocessing import remove_stopwords
from nltk.corpus import stopwords
nltk.download('wordnet')
nltk.download('stopwords')

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.utils import shuffle

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support, accuracy_score


from nltk.util import ngrams
from itertools import product
import joblib

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Piotr\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Piotr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# **Exploring data**

In [2]:
data = pd.read_csv("training.csv") # header = None names=["idk", "id", "date", "query", "nick", "content"])
# data.drop(["idk", "query", "nick"], axis=1, inplace=True)
test_data = pd.read_csv("test.csv")

In [3]:
data.head()

Unnamed: 0,text,label
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,3
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,3


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16000 entries, 0 to 15999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    16000 non-null  object
 1   label   16000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 250.1+ KB


In [5]:
data.tail()

Unnamed: 0,text,label
15995,i just had a very brief time in the beanbag an...,0
15996,i am now turning and i feel pathetic that i am...,0
15997,i feel strong and good overall,1
15998,i feel like this was such a rude comment and i...,3
15999,i know a lot but i feel so stupid because i ca...,0


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16000 entries, 0 to 15999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    16000 non-null  object
 1   label   16000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 250.1+ KB


# **Cleaning data**

In [7]:
lemmatizer = WordNetLemmatizer()

custom_stopwords = ["http", "https", "www", "com", "tinyurl"]

# credit to https://pytutorial.com/check-strig-url
url_pattern = re.compile(
        r'^(?:http|ftp)s?://' # http:// or https://
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|' #domain...
        r'localhost|' #localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # ...or ip
        r'(?::\d+)?' # optional port
        r'(?:/?|[/?]\S+)$', re.IGNORECASE)

In [8]:
def process_and_verify(line, custom_stopwords, url_pattern, lemmatizer):
    line = line.lower()
    line = remove_stopwords(line)
    res = []
    for word in str(line).split(' '):
        if url_pattern.match(word) is None:
            res.append(word)
            
    res = [lemmatizer.lemmatize(re.sub("[^a-zA-Z@' $]", " ", word)) for word in res]
    
    temp = []
    for word in res:
        for part in word.split(' '):
            part = lemmatizer.lemmatize(re.sub("[^a-zA-Z$]", "", part))
            # eliminate nicknames and words that are empty/of length 1
            if len(part) > 1 and word[0] != '@' and ' ' not in part and part not in custom_stopwords:
                temp.append(part)
    
    res = (' ').join(temp)
    res = remove_stopwords(res)
    return res

In [9]:
Xdata = np.asarray([process_and_verify(line, custom_stopwords, url_pattern, lemmatizer) for line in data['text']])

X_final_test = np.asarray([process_and_verify(line, custom_stopwords, url_pattern, lemmatizer) for line in test_data['text']])

# **Encoding data**

In [10]:
vectorizer = TfidfVectorizer()

Xdata_encoded = np.asarray(vectorizer.fit_transform(Xdata).todense())
X_final_test_encoded = np.asarray(vectorizer.transform(X_final_test).todense())

# **Searching for optimal hiperparameters**

In [11]:
# sample training just to see if everything works fine - accuracy around 85%

X_train, X_test, y_train, y_test = train_test_split(Xdata_encoded, data['label'], test_size=0.20)

classifier = LogisticRegression(solver='liblinear')
classifier.fit(X_train, y_train)

y_res = classifier.predict(X_test)
precision_recall_fscore_support(y_test, y_res)

accuracy_score(y_test, y_res)

# cross_val_score(classifier, Xdata_encoded, data['label'])

0.8503125

In [12]:
# generator for cartesian product of the dict - actually redudant in the code below
def params_product(params):
    return (dict(zip(params.keys(), values)) for values in product(*params.values()))

# for logistic regression
def custom_grid_search(X, y, params):
    X, y = shuffle(X, y)
    best_score = 0
    best_kwargs = dict()
    for kwargs in params_product(params):
        print(kwargs)
        classifier = LogisticRegression(**kwargs, solver='liblinear')
        score = sum(cross_val_score(classifier, X, y)) / 5
        print(score)
        if score > best_score:
            best_score = score
            best_kwargs = kwargs
    return best_kwargs

# has result only if key 'C' is in hiperparams
def adjust_regularization_parameter(X, y, hiperparams):
    if 'C' in hiperparams:
        best_score = 0
        best_c = 0
        
        for x in range(10*(hiperparams['C'] - 1), 10*(hiperparams['C'] + 1) + 1, 1):
            c = x / 10
            classifier = LogisticRegression(C=c, solver='liblinear')
            print(c)
            score = sum(cross_val_score(classifier, X, y)) / 5
            print(score)
            if score > best_score:
                best_score = score
                best_c = c
            
    return best_c


In [13]:
# looking for optimal regularization factor (the higher, the less complex the model)
params = {"C": [x for x in range(1, 14)]}
hiperparams = custom_grid_search(Xdata_encoded, data['label'], params)


{'C': 1}
0.8500624999999999
{'C': 2}
0.8703125
{'C': 3}
0.8761875
{'C': 4}
0.8778750000000001
{'C': 5}
0.8791874999999999
{'C': 6}
0.8805624999999999
{'C': 7}
0.880875
{'C': 8}
0.8811250000000002
{'C': 9}
0.8815
{'C': 10}
0.8815000000000002
{'C': 11}
0.882
{'C': 12}
0.8824375
{'C': 13}
0.8829374999999999


In [14]:
hiperparams['C'] = adjust_regularization_parameter(Xdata_encoded, data['label'], hiperparams)

12.0
0.8790625000000001
12.1
0.8790000000000001
12.2
0.8787500000000001
12.3
0.8786875000000001
12.4
0.8787499999999999
12.5
0.8787499999999999
12.6
0.8786875000000001
12.7
0.8786875000000001
12.8
0.8787499999999999
12.9
0.8786875000000001
13.0
0.8786250000000001
13.1
0.8785000000000001
13.2
0.8785000000000001
13.3
0.8784374999999999
13.4
0.8784374999999999
13.5
0.8783749999999999
13.6
0.8783749999999999
13.7
0.8783124999999998
13.8
0.8782499999999999
13.9
0.8782499999999999
14.0
0.8782500000000001


# **Training and testing model**

In [16]:
final_model = LogisticRegression(**hiperparams, solver='liblinear')
final_model.fit(Xdata_encoded, data['label'])

LogisticRegression(C=12.0, solver='liblinear')

In [17]:
y_predict = final_model.predict(X_final_test_encoded)

precision_recall_fscore_support(test_data['label'], y_predict)

(array([0.91808874, 0.89958159, 0.78571429, 0.875     , 0.86175115,
        0.81481481]),
 array([0.92598967, 0.92805755, 0.76100629, 0.86545455, 0.83482143,
        0.66666667]),
 array([0.92202228, 0.91359773, 0.77316294, 0.8702011 , 0.84807256,
        0.73333333]),
 array([581, 695, 159, 275, 224,  66], dtype=int64))

In [20]:
accuracy_score(test_data['label'], y_predict)

0.8865

# **Saving model to file**

In [18]:
joblib.dump(final_model, "ModelTextEmotions.sav")

['ModelTextEmotions.sav']