# UniBa Sentiment Analysis 24/25

In [None]:
import sys
import os
import import_ipynb

# Add the folder containing "files.ipynb" to the Python path
sys.path.append(os.path.abspath("Utils"))
# Import the files notebook
import Utils.file_manager as fm

import numpy as np
import pandas as pd

SEED = 48
DIM_PRE_EMBEDDING = 200
ARRAY_DIRECTORY_PATH = "./Array_After_Preprocessing"


### Dataset '**Train**'

In [38]:
df = pd.read_csv("./Datasets/train.csv")

In [39]:
df.describe()

Unnamed: 0,id,polarity,text,source
count,8082,8082,8082,8082
unique,8082,3,8030,2
top,t1,neutral,"Excellent, thanks!",github
freq,1,3301,7,4985


In [40]:
df.source.value_counts()

source
github           4985
stackoverflow    3097
Name: count, dtype: int64

In [41]:
df.head()

Unnamed: 0,id,polarity,text,source
0,t1,negative,"Vineet, what you are trying to do is a terribl...",stackoverflow
1,t2,positive,"'Course I do, corrected.",stackoverflow
2,t3,positive,"Excellent, happy to help! If you don't mind, c...",stackoverflow
3,t6,negative,@talnicolas I'm using it a few dozen times in ...,stackoverflow
4,t7,neutral,I didn't select an answer because even though ...,stackoverflow


### Dataset '**Test_public**'

In [42]:
df_test_public = pd.read_csv("./Datasets/test_public.csv")

In [43]:
df_test_public.describe()

Unnamed: 0,id,text,source
count,3463,3463,3463
unique,3463,3454,2
top,t4,what about 3rd question?( When is it appropria...,github
freq,1,2,2137


In [44]:
df_test_public.source.value_counts()

source
github           2137
stackoverflow    1326
Name: count, dtype: int64

In [45]:
df_test_public.head()

Unnamed: 0,id,text,source
0,t4,@DrabJay: excellent suggestion! Code changed. :-),stackoverflow
1,t5,Any decent browser should protect against mali...,stackoverflow
2,t8,I swear - I don't put pseudo code I get told o...,stackoverflow
3,t9,I have attached below,stackoverflow
4,t13,When I refactor the following line: using Resh...,stackoverflow


## Preprocessing 1
### Dataset '**train**'

In [46]:
from sklearn.preprocessing import LabelEncoder

In [47]:
y1 = LabelEncoder().fit_transform(df.polarity)

## Preprocessing 2 - extract EMBEDDINGs

In [48]:
import spacy

In [49]:
# 1. Initialize a blank pipeline
nlp = spacy.blank("en")

### Import embeddings Model from disk

In [50]:
tok2vec = nlp.add_pipe("tok2vec", config={'model': {'width': DIM_PRE_EMBEDDING}})
tok2vec.from_disk("./Embedding_model_trained")

<spacy.pipeline.tok2vec.Tok2Vec at 0x1f498c51850>

### Dataset '**train**'
#### Get embedding for 'text' column

In [None]:
X_embedded = fm.import_array("X_embedded", ARRAY_DIRECTORY_PATH)

### Dataset '**test_public**'
#### Get embedding for 'text' column

In [None]:
X_TP_embedded = fm.import_array("X_TP_embedded", ARRAY_DIRECTORY_PATH)

## Preprocessing 3 - extract NEW FEATUREs

### Dataset '**train**'
#### Get new features from 'text' column

In [None]:
X_new_features = fm.import_array("X_new_features", ARRAY_DIRECTORY_PATH)

### Dataset '**test_public**'
#### Get new features from 'text' column

In [None]:
X_TP_new_feature = fm.import_array("X_TP_new_feature", ARRAY_DIRECTORY_PATH)

## Preprocessing 4 - extract BIGRAMs

### Dataset '**train**'
#### Get the most frequent bigrams from 'text' column

In [None]:
X_bigram = fm.import_array("X_bigram", ARRAY_DIRECTORY_PATH)

In [56]:
# Merge of all the columns after the transormations
X1 = np.concatenate([X_embedded, X_new_features, X_bigram], axis=1)

### Dataset '**test_public**'
#### Get the most frequent bigrams from 'text' column

In [None]:
X_TP_bigram = fm.import_array("X_TP_bigram", ARRAY_DIRECTORY_PATH)

In [None]:
# Merge of all the columns after the transormations
X_TP = np.concatenate([X_TP_embedded, X_TP_new_feature, X_TP_bigram], axis=1)

## Training

In [71]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

import datetime

In [74]:
pipe = Pipeline([
    ('clf', GradientBoostingClassifier(subsample=0.8, min_samples_leaf=20, max_depth=15, random_state=SEED))
    ])

splits = StratifiedKFold(n_splits=5, shuffle=True, random_state= SEED).split(X1, y1)

param_grid = {
    'clf__n_estimators': [50, 70]
    }

In [75]:
best_estim_list = []
scores_list = []
params_list = []

# Outer loop
for i, (train_idx, test_idx) in enumerate(splits):

    print(f'SPLIT {i+1}')

    # Split data into training and test sets for the current split
    X_train, X_test = X1[train_idx], X1[test_idx]
    y_train, y_test = y1[train_idx], y1[test_idx]

    # Inner loop
    gs_CLS = GridSearchCV(estimator=pipe,
                                 param_grid=param_grid,
                                 cv=2,
                                 scoring='f1_macro',
                                 refit = True,
                                 n_jobs=2)
    


    START_t = datetime.datetime.now()

    # Fit the GridSearchCV
    gs_CLS.fit(X_train, y_train)

    END_t = datetime.datetime.now()



    # Retrieve and store the best model from GridSearchCV    
    best_estim_list.append(gs_CLS.best_estimator_)

    # Retrieve and store the best param from GridSearchCV
    params_list.append(gs_CLS.best_params_)

    # Print the best parameters and score
    print(f"SCORE sul train -> {gs_CLS.best_score_}\n Training time -> {END_t-START_t}\n")

    scores_list.append(gs_CLS.best_estimator_.score(X_test, y_test))

SPLIT 1
SCORE sul train -> 0.7294385206333928
 Training time -> 0:07:40.191032

SPLIT 2
SCORE sul train -> 0.7279638640151812
 Training time -> 0:07:34.528042

SPLIT 3
SCORE sul train -> 0.7248186252120182
 Training time -> 0:07:34.989887

SPLIT 4
SCORE sul train -> 0.7271517647833892
 Training time -> 0:07:38.904959

SPLIT 5
SCORE sul train -> 0.7373978356507728
 Training time -> 0:07:34.159410



### Best score on test set

In [76]:
max(scores_list)

0.7852722772277227

In [None]:
best_index = np.argmax(scores_list)

final_param = params_list[best_index]

print("The best parameters (for this Dataset) using GradientBoostingClassifier are:")
print(final_param)

I migliori parametri (per questo Dataset) usando il classificatore ... sono:
{'clf__n_estimators': 70}


In [79]:
classificator = best_estim_list[best_index]
classificator

### Prediction on test dataset 

In this are you won't find the test score because X_TP (test_public) don't contain the feature 'polarity'.  
When the challenge was over I got the whole test Dataset, so if you want you could use test.csv (path -> "./Datasets/test.csv") instead of test_public.

In [80]:
y_test_public = classificator.predict(X_TP)

### Make the test public dataset presentable

In [81]:
# Attach the id
col1 = np.asarray(df_test_public.loc[:, ["id"]])
col2 = np.reshape(y_test_public, (3463, 1))

y_submission = np.concatenate( (col1, col2), axis=1)

In [82]:
y_submission = pd.DataFrame(y_submission)
y_submission.columns = ['id', 'polarity']
y_submission.loc[:, "polarity"].value_counts()

polarity
1    1535
2    1072
0     856
Name: count, dtype: int64

In [None]:
y_submission.to_csv('./Submission/Submit_∞.csv', index=False) # ∞