#### <h1>Φόρτωση βιβλιοθηκών</h1>

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re
import json
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from wordcloud import WordCloud
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix
import numpy as np
from sklearn.model_selection import train_test_split,StratifiedKFold
import itertools
from sklearn.preprocessing import StandardScaler, LabelEncoder

## <h1>Φόρτωση των δεδομένων μας</h1>

In [None]:
# Read the three competition splits (train, test, validation) in one pass.
df_train_set, df_test_set, df_valid_set = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/ys19-2023-assignment-1/train_set.csv',
        '/kaggle/input/ys19-2023-assignment-1/test_set.csv',
        '/kaggle/input/ys19-2023-assignment-1/valid_set.csv',
    )
)

**Let's check for null values**

In [None]:
# Show the first rows and the column summary of every split (train, valid, test).
for split_df in (df_train_set, df_valid_set, df_test_set):
    print(split_df.head(), '\n')
    print(split_df.info(), '\n')

**Let's create some barplots that illustrate the number of tweets of each sentiment for each party**

In [None]:
# Tweet counts per (Sentiment, Party) pair, computed the same way for both labelled splits.
group_df_by_sentiment_party_train, group_df_by_sentiment_party_valid = (
    labelled_df.groupby(['Sentiment', 'Party']).size().reset_index(name='NumOfTweets')
    for labelled_df in (df_train_set, df_valid_set)
)

**Below we can observe the number of tweets and their sentiment for each party for the train and valid sets**

In [None]:
# Grouped bars for the training split: one cluster per party, one bar per sentiment.
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(
    data=group_df_by_sentiment_party_train,
    x='Party',
    y='NumOfTweets',
    hue='Sentiment',
    ax=ax,
)
ax.set_title('Number of Tweets/Sentiment per Party for Train set')
plt.show()

In [None]:
# Grouped bars for the validation split: one cluster per party, one bar per sentiment.
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(
    data=group_df_by_sentiment_party_valid,
    x='Party',
    y='NumOfTweets',
    hue='Sentiment',
    ax=ax,
)
ax.set_title('Number of Tweets/Sentiment per Party for Valid set')
plt.show()

**Plot of the number of tweets concerning each party in the test set**

In [None]:
# The test split has no Sentiment column to group by here, so only tweets per party are shown.
party_counts = df_test_set['Party'].value_counts()
ax = party_counts.plot(kind='bar', figsize=(12, 8))
ax.set_title('Number of Tweets/Party')
ax.set_ylabel('Num of Tweets')
ax.set_xlabel('Party')
plt.xticks(rotation=45)
plt.show()


<h1>Data Preprocessing</h1>

**Turn the categorical classes of the train and the validation sets to numerical**

In [None]:
# Peek at the Sentiment labels as loaded, before the encoding step in the next cell.
df_train_set['Sentiment'].head()

In [None]:
# Learn the label -> integer mapping from the training labels only and reuse that
# exact mapping for the validation labels. Re-fitting on the validation split would
# build an independent mapping (and leave `le` fitted on validation data), which is
# only correct if both splits happen to contain the same set of classes.
le = LabelEncoder()

df_train_set['Sentiment'] = le.fit_transform(df_train_set['Sentiment'])
# transform() raises ValueError on a label never seen in training instead of
# silently assigning it a mismatched integer.
df_valid_set['Sentiment'] = le.transform(df_valid_set['Sentiment'])
print(df_train_set['Sentiment'].head())
print(df_valid_set['Sentiment'].head())

**Now let's create a function that turns the text of each tweet to lowercase and removes stopwords, special characters, URLs, mentions, etc.**

In [None]:
# NOTE: The stopword list is the stopwords-el.json file from
# https://github.com/stopwords-iso/stopwords-el, downloaded locally and uploaded
# to Kaggle as the dataset that is read below.

# Load the Greek stopwords once and keep them in a frozenset, so that the per-word
# membership test in preprocess_tweet is a hash lookup rather than a linear scan.
with open('/kaggle/input/greek-stopwords-2/stopwords_el_2.json', 'r', encoding='utf-8') as file:
    greek_stopwords = frozenset(json.load(file))

def preprocess_tweet(tweet):
    """Normalise one tweet for vectorisation.

    Steps, in order: lowercase and turn underscores into spaces; drop
    @mentions; drop URLs; keep only Greek letters, digits and whitespace;
    drop Greek stopwords (module-level ``greek_stopwords``).

    Parameters
    ----------
    tweet : str
        Raw tweet text. Any non-string value (e.g. a NaN from a missing cell)
        is treated as an empty tweet.

    Returns
    -------
    str
        The remaining words joined by single spaces ('' if nothing is left).
    """
    # Missing cells arrive as float NaN; there is no text to clean.
    if not isinstance(tweet, str):
        return ''

    tweet = tweet.lower().replace('_', ' ')

    # delete mentions
    tweet = re.sub(r'@\w+', '', tweet)

    # delete urls
    tweet = re.sub(r'http\S+', '', tweet)

    # Keep only Greek letters, digits and whitespace. Latin letters are dropped on
    # purpose (the author's experiment to see whether it improves the F1 score).
    # The dialytika vowels (ϊ ϋ ΐ ΰ) are part of the whitelist; without them words
    # such as "ευρωπαϊκή" would lose a letter.
    tweet = re.sub(r'[^αβγδεζηθικλμνξοπρστυφχψωςάέίόώύήϊϋΐΰΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ0-9\s]', '', tweet)

    # delete Greek stopwords; split()/join() also collapses runs of whitespace,
    # so no extra strip() is needed.
    cleaned_words = [word for word in tweet.split() if word not in greek_stopwords]
    return ' '.join(cleaned_words)


In [None]:
# Clean the Text column of every split in place (train, test, validation).
for split_df in (df_train_set, df_test_set, df_valid_set):
    split_df['Text'] = split_df['Text'].apply(preprocess_tweet)

In [None]:
# Spot-check the cleaned text of each split.
for split_df in (df_train_set, df_test_set, df_valid_set):
    print(split_df['Text'].head(), '\n')

****

**We load the spaCy model to perform lemmatization and tokenization of Greek words** <br>
Sometimes it's necessary to restart the kernel in order for the following to work

<h2>ATTENTION:</h2><h2>The following command needs to be executed only once. If the spacy.load() command that follows it raises an error, just restart the kernel and run all the commands except this one.</h2>

In [None]:
%%capture
# Upgrades the spaCy package with pip; the %%capture magic above hides pip's output.
# Run this cell only once. If the spacy.load() call in the next cell then fails,
# restart the kernel and run every cell except this one.
!pip install -U spacy  

In [None]:
# Load the spaCy pipeline from a local directory of an uploaded Kaggle dataset
# (the path names the el_core_news_lg model, version 3.7.0).
nlp = spacy.load('/kaggle/input/el-core-news-lg-3/el_core_news_lg_3/el_core_news_lg-3.7.0')

**Below we perform lemmatization and tokenization**

In [None]:
# The el_core_news_lg model used for lemmatization/tokenization was downloaded
# locally, zipped and uploaded as a public dataset.
def lemmatize_tokenize_text(text):
    """Run `text` through the global spaCy pipeline `nlp` and return the
    lemmas of its tokens joined by single spaces."""
    lemmas = (token.lemma_ for token in nlp(text))
    return ' '.join(lemmas)


# Replace the Text column of every split with its lemmatized form (train, test, validation).
for split_df in (df_train_set, df_test_set, df_valid_set):
    split_df['Text'] = split_df['Text'].apply(lemmatize_tokenize_text)

In [None]:
# Spot-check the lemmatized text of each split.
for split_df in (df_train_set, df_test_set, df_valid_set):
    print(split_df['Text'].head(), '\n')

Now we will calculate the number of unique words in each **Text** column and also plot a **Word cloud** for each of these columns

In [None]:
def unique_words_num(tweets):
    """Return how many distinct whitespace-separated words occur across all
    strings in `tweets`."""
    return len({word for tweet in tweets for word in tweet.split()})

In [None]:
# Vocabulary size of each split after preprocessing and lemmatization.
for caption, split_df in (
    ("Num of unique words in df_train_set:", df_train_set),
    ("Num of unique words in df_test_set:", df_test_set),
    ("Num of unique words in df_valid_set:", df_valid_set),
):
    print(caption, unique_words_num(split_df['Text']))

Now let's plot the wordcloud for each dataframe

In [None]:
def plot_wordcloud(tweets, title):
    """Draw a word cloud (at most 200 words, white background) built from all
    strings in `tweets`, titled `title`."""
    corpus = ' '.join(tweets)
    cloud = WordCloud(
        width=800,
        height=400,
        background_color='white',
        max_words=200,
    ).generate(corpus)

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.imshow(cloud, interpolation='bilinear')
    ax.axis('off')
    ax.set_title(title)
    plt.show()

# One word cloud per split, titled with the dataframe's name.
for cloud_title, split_df in (
    ('df_train_set', df_train_set),
    ('df_test_set', df_test_set),
    ('df_valid_set', df_valid_set),
):
    plot_wordcloud(split_df['Text'], cloud_title)

<h1>TF-IDF Vectorizer</h1>
Now we will use the TF-IDF Vectorizer to transform our text data to numerical data that can be interpreted by our model

In [None]:
# Encoded sentiment labels used as targets for training and validation.
Y_train, Y_val = df_train_set['Sentiment'], df_valid_set['Sentiment']

In [None]:
def choose_max_features(df_train_set, df_test_set, df_valid_set, Y_train, Y_val, max_features):
    """Build a learning curve for a TF-IDF vocabulary capped at `max_features`.

    The vectorizer is fitted on the training text only. A LogisticRegression
    (max_iter=2000) is then trained on random 10%, 20%, ..., 90% subsets of the
    training matrix and scored with macro F1 on that subset and on the
    validation set. The subsets are drawn without a fixed random_state, so the
    scores vary from run to run.

    Returns
    -------
    tuple
        (list_sample_size, list_f1, list_f1_train, mean_f1_train,
        mean_f1_validation, X_train_tf, X_test_tf, X_valid_tf)
    """
    vectorizer = TfidfVectorizer(max_features=max_features)
    X_train_tf = vectorizer.fit_transform(df_train_set['Text'])
    X_test_tf = vectorizer.transform(df_test_set['Text'])
    X_valid_tf = vectorizer.transform(df_valid_set['Text'])

    list_sample_size, list_f1, list_f1_train = [], [], []

    for step in range(9):
        fraction = step * 0.1 + 0.1
        X_part, _, y_part, _ = train_test_split(X_train_tf, Y_train, train_size=fraction)

        clf = LogisticRegression(max_iter=2000)
        clf.fit(X_part, y_part)
        pred_train = clf.predict(X_part)
        pred_valid = clf.predict(X_valid_tf)

        list_f1_train.append(f1_score(y_part, pred_train, average='macro'))
        list_f1.append(f1_score(Y_val, pred_valid, average='macro'))
        list_sample_size.append(fraction)

    return (
        list_sample_size,
        list_f1,
        list_f1_train,
        np.mean(list_f1_train),
        np.mean(list_f1),
        X_train_tf,
        X_test_tf,
        X_valid_tf,
    )




In [None]:
def plot_learning_curve(list_sample_size, list_f1, list_f1_train):
    """Plot validation and training F1 against the training-data proportion,
    with the y-axis starting at 0."""
    fig, ax = plt.subplots()
    ax.plot(list_sample_size, list_f1, label="Validation")
    ax.plot(list_sample_size, list_f1_train, label="Training")
    ax.set_ylim(bottom=0)
    ax.legend()
    ax.set_xlabel("Proportion of Training Data")
    ax.set_ylabel("F1 Score")
    plt.show()

**Below we experiment with different values for the max_features hyperparameter**

In [None]:
# Sweep max_features over 1000, 3000, ..., 11000: one learning curve per value,
# plus the mean train/validation F1 of that curve for the summary table.
f1_scores_data = []

for n_features in range(1000, 12000, 2000):
    print(f'\n\n FOR max_features = {n_features} \n\n')
    list_sample_size, list_f1, list_f1_train, mean_f1_train, mean_f1_validation, _, _, _ = choose_max_features(
        df_train_set, df_test_set, df_valid_set, Y_train, Y_val, n_features
    )
    f1_scores_data.append(
        dict(max_features=n_features, mean_f1_train=mean_f1_train, mean_f1_validation=mean_f1_validation)
    )
    plot_learning_curve(list_sample_size, list_f1, list_f1_train)

f1_scores_df = pd.DataFrame(f1_scores_data)
print(f1_scores_df)

# Row with the highest mean validation F1 decides the winner.
best_row = f1_scores_df['mean_f1_validation'].idxmax()
best_max_features = f1_scores_df.loc[best_row, 'max_features']
print(f'The best value for the max_features hyperparameter is {best_max_features}')

We will also experiment with the **ngram_range** hyperparameter

In [None]:
def choose_ngram_range(df_train_set, df_test_set, df_valid_set, Y_train, Y_val, ngram_range):
    """Build a learning curve for a TF-IDF vectorizer using `ngram_range`.

    Mirrors choose_max_features: the vectorizer is fitted on the training text
    only, then a LogisticRegression (max_iter=2000) is trained on random 10%,
    20%, ..., 90% subsets of the training matrix and scored with macro F1 on
    that subset and on the validation set. The subsets are drawn without a
    fixed random_state, so the scores vary from run to run.

    Returns
    -------
    tuple
        (list_sample_size, list_f1, list_f1_train, mean_f1_train,
        mean_f1_validation, X_train_tf, X_test_tf, X_valid_tf)
    """
    vectorizer = TfidfVectorizer(ngram_range=ngram_range)
    X_train_tf = vectorizer.fit_transform(df_train_set['Text'])
    X_test_tf = vectorizer.transform(df_test_set['Text'])
    X_valid_tf = vectorizer.transform(df_valid_set['Text'])

    list_sample_size, list_f1, list_f1_train = [], [], []

    for step in range(9):
        fraction = step * 0.1 + 0.1
        X_part, _, y_part, _ = train_test_split(X_train_tf, Y_train, train_size=fraction)

        clf = LogisticRegression(max_iter=2000)
        clf.fit(X_part, y_part)
        pred_train = clf.predict(X_part)
        pred_valid = clf.predict(X_valid_tf)

        list_f1_train.append(f1_score(y_part, pred_train, average='macro'))
        list_f1.append(f1_score(Y_val, pred_valid, average='macro'))
        list_sample_size.append(fraction)

    return (
        list_sample_size,
        list_f1,
        list_f1_train,
        np.mean(list_f1_train),
        np.mean(list_f1),
        X_train_tf,
        X_test_tf,
        X_valid_tf,
    )


In [None]:
# Candidate n-gram ranges. The wider ranges (1, 3), (2, 3) and (1, 4) were
# commented out by the author and are not evaluated.
ngram_ranges = [(1, 1), (1, 2), (2, 2)]
ngram_scores_data = []

for candidate_range in ngram_ranges:
    print(f'\n\n FOR ngram_range = {candidate_range} \n\n')
    list_sample_size, list_f1, list_f1_train, mean_f1_train, mean_f1_validation, _, _, _ = choose_ngram_range(
        df_train_set, df_test_set, df_valid_set, Y_train, Y_val, candidate_range
    )
    ngram_scores_data.append(
        dict(ngram_range=candidate_range, mean_f1_train=mean_f1_train, mean_f1_validation=mean_f1_validation)
    )
    plot_learning_curve(list_sample_size, list_f1, list_f1_train)

ngram_scores_df = pd.DataFrame(ngram_scores_data)
print(ngram_scores_df)

# Row with the highest mean validation F1 decides the winner.
best_row = ngram_scores_df['mean_f1_validation'].idxmax()
best_ngram_range = ngram_scores_df.loc[best_row, 'ngram_range']
print(f'The best ngram_range for the TF-IDF vectorizer is {best_ngram_range}')

In [None]:
# Vectorizer used for all remaining experiments. Its settings are written out by hand
# (1000 features, unigrams + bigrams) rather than read from best_max_features /
# best_ngram_range. The scikit-learn default for ngram_range would be (1, 1).
tf_vec = TfidfVectorizer(max_features=1000, ngram_range=(1, 2))
X_train_tf = tf_vec.fit_transform(df_train_set['Text'])
X_test_tf, X_valid_tf = (
    tf_vec.transform(split_df['Text']) for split_df in (df_test_set, df_valid_set)
)

<h1> Model hyperparameters experiments </h1>

In [None]:
# Candidate values for the LogisticRegression hyperparameter experiments below.
solvers = ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag']
penalty = ['l1', 'l2', 'elasticnet']
C = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]
random_state = [1, 2, 3, None]
max_iter = list(range(1000, 10000, 1000))  # 1000, 2000, ..., 9000
multi_class = ['auto', 'ovr', 'multinomial']
# Full Cartesian product of the six lists above (5 * 3 * 7 * 4 * 9 * 3 = 11340 tuples).
# NOTE(review): `combinations` is not referenced anywhere else in the visible notebook.
combinations = list(itertools.product(solvers, penalty, C, random_state, max_iter, multi_class))

In [None]:
def plot_learning_curves(list_sample_size, list_f1, list_f1_train, list_recall, list_recall_train, list_precision, list_precision_train):
    """Draw three side-by-side learning curves (F1, recall, precision).

    Each panel plots the validation series in blue and the training series in
    red against `list_sample_size`, with the y-axis fixed to [0, 1].
    """
    # (legend suffix, title/y-axis name, validation series, training series)
    panels = (
        ('F1', 'F1 Score', list_f1, list_f1_train),
        ('Recall', 'Recall', list_recall, list_recall_train),
        ('Precision', 'Precision', list_precision, list_precision_train),
    )

    fig, axs = plt.subplots(1, 3, figsize=(15, 5))

    for ax, (short_name, full_name, valid_series, train_series) in zip(axs, panels):
        ax.plot(list_sample_size, valid_series, 'b', label=f'Validation {short_name}')
        ax.plot(list_sample_size, train_series, 'r', label=f'Training {short_name}')
        ax.set_title(f'{full_name} vs. Proportion of Training Data')
        ax.set_xlabel('Proportion of Training Data')
        ax.set_ylabel(full_name)
        ax.legend()
        ax.set_ylim([0, 1])

    plt.tight_layout()
    plt.show()


<h2>Experiments with iterations</h2>

The number of iterations is the number of times the algorithm will loop through the entire training dataset. Each iteration is a step towards optimizing the weights of the features in the dataset, with the goal of minimizing the error of the model's predictions. <br>

**During each iteration, the algorithm performs the following steps:**
<ul>
    <li>Calculation: It calculates the predictions based on the current weights.</li>
    <li>Comparison: It compares the predictions with the actual targets.</li>
    <li>Error Measurement: It measures the error of these predictions.</li>
    <li>Adjustment: It adjusts the weights to reduce the error.</li>
</ul>

These steps are repeated for the specified number of iterations or until the improvement becomes negligibly small, which indicates that the model has converged to a solution. <br>

The number of iterations is a critical hyperparameter. Too few iterations might mean the model hasn't learned enough from the data, resulting in underfitting, while too many iterations can lead to overfitting, where the model learns too much from the training data, including the noise, leading to poor generalization on unseen data.

In [None]:
def test_iterations(X_train_tf, X_valid_tf, Y_train, Y_val, max_iter_list):
    """Compare `max_iter` values for LogisticRegression by macro F1.

    For every candidate a model is trained on random 10%, 20%, ..., 90% subsets
    of the training data and scored on that subset and on the validation set.
    A learning curve is plotted per candidate, a summary table is printed, and
    a confusion matrix is shown for the candidate with the highest mean
    validation F1. The subsets are drawn without a fixed random_state, so the
    outcome can differ between runs.

    Parameters
    ----------
    X_train_tf, X_valid_tf : training / validation features.
    Y_train, Y_val : matching labels.
    max_iter_list : iterable of int
        Candidate values for ``max_iter``.

    Returns
    -------
    (best_iter, results_df)
        `best_iter` is None when `max_iter_list` is empty.
    """
    best_f1 = -1
    best_iter = None
    best_conf_matrix = None
    iter_results = []
    # Training-set proportions shared by every learning curve; also used as the
    # x-axis so that it matches its "Proportion of Training Data" label.
    list_sample_size = [times / 10 for times in range(1, 10)]

    for itr in max_iter_list:
        list_f1 = []
        list_f1_train = []

        for train_fraction in list_sample_size:
            # split the data
            X_subset, _, y_subset, _ = train_test_split(X_train_tf, Y_train, train_size=train_fraction)
            classifier = LogisticRegression(max_iter=itr)
            classifier.fit(X_subset, y_subset)

            # make predictions
            y_pred_train = classifier.predict(X_subset)
            y_pred_val = classifier.predict(X_valid_tf)

            # macro F1 on the training subset and on the validation set
            list_f1_train.append(f1_score(y_subset, y_pred_train, average='macro'))
            list_f1.append(f1_score(Y_val, y_pred_val, average='macro'))

        mean_f1_train = np.mean(list_f1_train)
        mean_f1_validation = np.mean(list_f1)

        if mean_f1_validation > best_f1:
            best_f1 = mean_f1_validation
            best_iter = itr
            # y_pred_val still holds the predictions of the last model fitted
            # above, i.e. the one trained on the 90% subset.
            best_conf_matrix = confusion_matrix(Y_val, y_pred_val)

        iter_results.append({
            'max_iter': itr,
            'mean_f1_train': mean_f1_train,
            'mean_f1_validation': mean_f1_validation
        })

        # plot learning curve
        plt.figure()
        plt.plot(list_sample_size, list_f1_train, label='Training F1')
        plt.plot(list_sample_size, list_f1, label='Validation F1')
        plt.title(f'Learning Curves for max_iter={itr}')
        plt.xlabel('Proportion of Training Data')
        plt.ylabel('F1 Score')
        plt.legend()
        plt.show()

    results_df = pd.DataFrame(iter_results)
    print(results_df)

    if best_conf_matrix is not None:
        sns.heatmap(best_conf_matrix, annot=True, fmt='d', cmap='Blues')
        plt.title(f'Confusion Matrix for max_iter={best_iter}')
        plt.xlabel('Predicted Label')
        plt.ylabel('True Label')
        plt.show()

    return best_iter, results_df

Usually the values that the above function returns for the **max_iter** hyperparameter are **1000**, **4000** and **9000**. I chose to use 9000 for max_iter so as to avoid the ITERATIONS REACHED LIMIT error.

In [None]:
# Run the max_iter comparison over the `max_iter` candidate list (1000 .. 9000).
best_max_iter, iter_df = test_iterations(X_train_tf, X_valid_tf, Y_train, Y_val, max_iter)
print(f"The best max_iter is: {best_max_iter}")

<h2> Experiments with solvers and penalty </h2>


In [None]:
# The author's best result was obtained with 4000 iterations, so that value is pinned
# here by hand (best_max_iter from the search above is not reused) and passed to all
# of the following experiments.
best_iter = 4000 

The **solver** and **penalty** hyperparameters in Logistic Regression are crucial as they define the optimization algorithm used for minimizing the cost function and the regularization technique applied to prevent overfitting, respectively.<br>

**Solver**: specifies the algorithm to use in the optimization problem.<br>
<ul>
<li>lbfgs, newton-cg, and sag are more suitable for large datasets as they handle multinomial loss and produce more robust results for multi-class problems.</li>
<li>liblinear is a good choice for small datasets and supports both l1 and l2 regularization.</li>
<li>saga is a variant of sag that also supports the l1 penalty, and is generally faster for large datasets.</li>
</ul><br>

**Penalty**: specifies the norm used in the penalization. Regularization is applied to the cost function to avoid overfitting by discouraging complex models:
<ul>
<li>l1 can lead to sparse models with coefficients that can be exactly zero. It’s useful for feature selection as it tends to shrink coefficients for less important features to zero.</li>
<li>l2 tends to shrink coefficients evenly but typically does not set them to zero. This is often more appropriate for problems with many correlated features.</li>
</ul>

In [None]:
def test_solvers_and_penalty(X_train_tf, X_valid_tf, Y_train, Y_val, solvers, best_iter):
    """Compare LogisticRegression solvers (each with one fixed penalty) by macro F1.

    For every solver a model is trained on random 10%, 20%, ..., 90% subsets of
    the training data and scored (F1, recall, precision; macro-averaged) on
    that subset and on the validation set. Learning curves are plotted per
    solver and a summary table is printed. The solver with the highest mean
    validation F1 is then refitted on the full training data and its
    validation confusion matrix is plotted.

    Parameters
    ----------
    X_train_tf, X_valid_tf : training / validation features.
    Y_train, Y_val : matching labels.
    solvers : iterable of str
        Solver names accepted by LogisticRegression.
    best_iter : int
        Passed as ``max_iter`` to every model.

    Returns
    -------
    (best_solver, best_penalty)
        Both are None when `solvers` is empty.
    """
    best_f1 = -1
    best_solver = None
    best_penalty = None
    # Features that belong to the winning solver (scaled for 'sag', raw otherwise),
    # so the final refit sees the same kind of input the solver was scored on.
    best_X_train, best_X_valid = X_train_tf, X_valid_tf
    score_rows = []

    for solve in solvers:
        # 'liblinear' is evaluated with the L1 penalty, every other solver with L2.
        penalty = 'l1' if solve == 'liblinear' else 'l2'

        if solve == 'sag':
            # 'sag' is run on scaled features. The scaled copies live in
            # solver-local variables so that the caller's matrices, the other
            # solvers and the final refit are not silently switched to scaled data.
            # with_mean=False skips centering (required if the inputs are sparse).
            scaler = StandardScaler(with_mean=False)
            X_fit = scaler.fit_transform(X_train_tf)
            X_eval = scaler.transform(X_valid_tf)
        else:
            X_fit, X_eval = X_train_tf, X_valid_tf

        list_f1, list_f1_train = [], []
        list_recall, list_recall_train = [], []
        list_precision, list_precision_train = [], []
        list_sample_size = []

        for times in range(9):
            train_fraction = times * 0.1 + 0.1
            X_subset, _, y_subset, _ = train_test_split(X_fit, Y_train, train_size=train_fraction)

            classifier = LogisticRegression(max_iter=best_iter, solver=solve, penalty=penalty)
            classifier.fit(X_subset, y_subset)

            results_train = classifier.predict(X_subset)
            results = classifier.predict(X_eval)

            list_f1_train.append(f1_score(y_subset, results_train, average='macro'))
            list_recall_train.append(recall_score(y_subset, results_train, average='macro'))
            list_precision_train.append(precision_score(y_subset, results_train, average='macro'))

            list_f1.append(f1_score(Y_val, results, average='macro'))
            list_recall.append(recall_score(Y_val, results, average='macro'))
            list_precision.append(precision_score(Y_val, results, average='macro'))
            list_sample_size.append(train_fraction)

        avg_f1_train = np.mean(list_f1_train)
        avg_f1_validation = np.mean(list_f1)

        # Collect plain records; the summary frame is built once after the loop.
        score_rows.append({
            'Solver': solve,
            'Penalty': penalty,
            'F1 Train Score': avg_f1_train,
            'F1 Validation Score': avg_f1_validation
        })

        if avg_f1_validation > best_f1:
            best_f1 = avg_f1_validation
            best_solver = solve
            best_penalty = penalty
            best_X_train, best_X_valid = X_fit, X_eval

        print(f'\n\n FOR solver = {solve} AND penalty = {penalty}\n\n')
        plot_learning_curves(list_sample_size, list_f1, list_f1_train, list_recall, list_recall_train, list_precision, list_precision_train)
    print(f'Best F1 score {best_f1:.2f} achieved with {best_solver} solver and penalty {best_penalty}.')

    df_scores = pd.DataFrame(
        score_rows,
        columns=['Solver', 'Penalty', 'F1 Train Score', 'F1 Validation Score']
    )
    print("\nF1 Scores for each Solver and Penalty:")
    print(df_scores)

    # Nothing to refit when no solver was evaluated.
    if best_solver is None:
        return best_solver, best_penalty

    best_model = LogisticRegression(max_iter=best_iter, solver=best_solver, penalty=best_penalty)
    best_model.fit(best_X_train, Y_train)
    best_predictions = best_model.predict(best_X_valid)
    cm = confusion_matrix(Y_val, best_predictions)

    plt.figure(figsize=(10, 7))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
    plt.title('Confusion Matrix')
    plt.ylabel('Actual Label')
    plt.xlabel('Predicted Label')
    plt.show()

    return best_solver, best_penalty

In [None]:
# Run the solver/penalty comparison with max_iter fixed to best_iter.
best_solver, best_penalty = test_solvers_and_penalty(X_train_tf, X_valid_tf, Y_train, Y_val, solvers, best_iter)
print(best_solver, best_penalty)

In [None]:
# Pinned by hand: this overwrites whatever the search in the previous cell returned.
best_solver, best_penalty = 'newton-cg', 'l2'

<h2>Experiments with C</h2><br>

The C hyperparameter plays a critical role in controlling the trade-off between achieving a low error on the training data and maintaining a model that generalizes well to unseen data. <br>
This hyperparameter is directly tied to the regularization strength with its value being the inverse of the regularization strength. <br>

**Low values of C**:<br>

Increase the regularization strength, which creates simpler models that may underfit the training data. This is because the optimization function will prioritize the simplicity (smaller coefficients, depending on the norm used in penalization) of the model over fitting the training data perfectly.
Useful when we believe the data is very noisy and we need to prevent the model from learning the noise.<br>

**High values of C**:<br>

Decrease the regularization strength, allowing the models to become more complex and fit the training data better. This can lead to overfitting if the model starts to learn the noise and detailed fluctuations within the training data.
Useful when the model suffers from high bias, i.e., it is too simple and does not capture the underlying trends well.

In [None]:
def test_C(X_train_tf, X_valid_tf, Y_train, Y_val, C, best_iter, best_solver, best_penalty):
    """Compare values of the inverse regularization strength `C` by macro F1.

    For every candidate a LogisticRegression is trained on random 10%, 20%,
    ..., 90% subsets of the training data and scored (F1, recall, precision;
    macro-averaged) on that subset and on the validation set. Learning curves
    are plotted per candidate and a summary table is printed. The candidate
    with the highest mean validation F1 is then refitted on the full training
    data and its validation confusion matrix is plotted.

    Parameters
    ----------
    X_train_tf, X_valid_tf : training / validation features.
    Y_train, Y_val : matching labels.
    C : iterable of float
        Candidate values for LogisticRegression's ``C``.
    best_iter, best_solver, best_penalty
        Passed as ``max_iter``, ``solver`` and ``penalty`` to every model.

    Returns
    -------
    The best value of C, or None when `C` is empty.
    """
    best_f1 = -1
    best_C = None
    score_rows = []

    for c in C:
        list_f1, list_f1_train = [], []
        list_recall, list_recall_train = [], []
        list_precision, list_precision_train = [], []
        list_sample_size = []

        for times in range(9):
            train_fraction = times * 0.1 + 0.1
            X_subset, _, y_subset, _ = train_test_split(X_train_tf, Y_train, train_size=train_fraction)

            classifier = LogisticRegression(max_iter=best_iter, solver=best_solver, penalty=best_penalty, C=c)
            classifier.fit(X_subset, y_subset)

            results_train = classifier.predict(X_subset)
            results = classifier.predict(X_valid_tf)

            list_f1_train.append(f1_score(y_subset, results_train, average='macro'))
            list_recall_train.append(recall_score(y_subset, results_train, average='macro'))
            list_precision_train.append(precision_score(y_subset, results_train, average='macro'))

            list_f1.append(f1_score(Y_val, results, average='macro'))
            list_recall.append(recall_score(Y_val, results, average='macro'))
            list_precision.append(precision_score(Y_val, results, average='macro'))
            list_sample_size.append(train_fraction)

        avg_f1_train = np.mean(list_f1_train)
        avg_f1_validation = np.mean(list_f1)

        # Collect plain records; the summary frame is built once after the loop
        # instead of concatenating onto an empty DataFrame on every iteration.
        score_rows.append({
            'C': c,
            'F1 Train Score': avg_f1_train,
            'F1 Validation Score': avg_f1_validation
        })

        if avg_f1_validation > best_f1:
            best_f1 = avg_f1_validation
            best_C = c

        print(f'\n\n FOR C = {c}\n\n')
        plot_learning_curves(list_sample_size, list_f1, list_f1_train, list_recall, list_recall_train, list_precision, list_precision_train)
    print(f'Best F1 score {best_f1:.2f} achieved with C = {best_C}.')

    df_scores = pd.DataFrame(score_rows, columns=['C', 'F1 Train Score', 'F1 Validation Score'])
    print("\nF1 Scores for each value of C:")
    print(df_scores)

    # Nothing to refit when no candidate was evaluated.
    if best_C is None:
        return best_C

    best_model = LogisticRegression(max_iter=best_iter, solver=best_solver, penalty=best_penalty, C=best_C)
    best_model.fit(X_train_tf, Y_train)
    best_predictions = best_model.predict(X_valid_tf)
    cm = confusion_matrix(Y_val, best_predictions)

    # plot the confusion matrix with seaborn
    plt.figure(figsize=(10, 7))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
    plt.title('Confusion Matrix')
    plt.ylabel('Actual Label')
    plt.xlabel('Predicted Label')
    plt.show()

    return best_C

In [None]:
# Run the C comparison; the result is stored in `best_c` (lower-case c).
best_c = test_C(X_train_tf, X_valid_tf, Y_train, Y_val, C, best_iter, best_solver, best_penalty)
print(best_c)

In [None]:
# Pinned by hand: the later cells read `best_C`, not the `best_c` returned by the search above.
best_C = 0.1

<h2>Experiments with the multi_class hyperparameter</h2>

In [None]:
# Candidate values for LogisticRegression's multi_class parameter
# (same values as the `multi_class` list defined with the other grids).
multi_class_ls = ['auto', 'ovr', 'multinomial']

In [None]:
def test_multi_class(X_train_tf, X_valid_tf, Y_train, Y_val, multi_class_ls, best_iter, best_solver, best_penalty , best_C):
    """Compare values of LogisticRegression's ``multi_class`` option by macro F1.

    For every candidate a model is trained on random 10%, 20%, ..., 90% subsets
    of the training data and scored (F1, recall, precision; macro-averaged) on
    that subset and on the validation set. Learning curves are plotted per
    candidate and a summary table is printed. The candidate with the highest
    mean validation F1 is then refitted on the full training data and its
    validation confusion matrix is plotted.

    Parameters
    ----------
    X_train_tf, X_valid_tf : training / validation features.
    Y_train, Y_val : matching labels.
    multi_class_ls : iterable of str
        Candidate values for ``multi_class``.
    best_iter, best_solver, best_penalty, best_C
        Passed as ``max_iter``, ``solver``, ``penalty`` and ``C`` to every model.

    Returns
    -------
    The best multi_class value, or None when `multi_class_ls` is empty.
    """
    # NOTE(review): recent scikit-learn releases deprecate the `multi_class`
    # argument; confirm the installed version still accepts it.
    best_f1 = -1
    best_multi_class_val = None
    score_rows = []

    for multi_class_val in multi_class_ls:
        list_f1, list_f1_train = [], []
        list_recall, list_recall_train = [], []
        list_precision, list_precision_train = [], []
        list_sample_size = []

        for times in range(9):
            train_fraction = times * 0.1 + 0.1
            X_subset, _, y_subset, _ = train_test_split(X_train_tf, Y_train, train_size=train_fraction)

            classifier = LogisticRegression(max_iter=best_iter, solver=best_solver, penalty=best_penalty, C=best_C, multi_class=multi_class_val)
            classifier.fit(X_subset, y_subset)

            results_train = classifier.predict(X_subset)
            results = classifier.predict(X_valid_tf)

            list_f1_train.append(f1_score(y_subset, results_train, average='macro'))
            list_recall_train.append(recall_score(y_subset, results_train, average='macro'))
            list_precision_train.append(precision_score(y_subset, results_train, average='macro'))

            list_f1.append(f1_score(Y_val, results, average='macro'))
            list_recall.append(recall_score(Y_val, results, average='macro'))
            list_precision.append(precision_score(Y_val, results, average='macro'))
            list_sample_size.append(train_fraction)

        avg_f1_train = np.mean(list_f1_train)
        avg_f1_validation = np.mean(list_f1)

        # Collect plain records; the summary frame is built once after the loop
        # instead of concatenating onto an empty DataFrame on every iteration.
        score_rows.append({
            'multi_class': multi_class_val,
            'F1 Train Score': avg_f1_train,
            'F1 Validation Score': avg_f1_validation
        })

        if avg_f1_validation > best_f1:
            best_f1 = avg_f1_validation
            best_multi_class_val = multi_class_val

        print(f'\n\n FOR multi_class = {multi_class_val}\n\n')
        plot_learning_curves(list_sample_size, list_f1, list_f1_train, list_recall, list_recall_train, list_precision, list_precision_train)
    print(f'Best F1 score {best_f1:.2f} achieved with multi_class = {best_multi_class_val}.')

    df_scores = pd.DataFrame(score_rows, columns=['multi_class', 'F1 Train Score', 'F1 Validation Score'])
    print("\nF1 Scores for each value of multi_class:")
    print(df_scores)

    # Nothing to refit when no candidate was evaluated.
    if best_multi_class_val is None:
        return best_multi_class_val

    best_model = LogisticRegression(max_iter=best_iter, solver=best_solver, penalty=best_penalty, C=best_C, multi_class=best_multi_class_val)
    best_model.fit(X_train_tf, Y_train)
    best_predictions = best_model.predict(X_valid_tf)
    cm = confusion_matrix(Y_val, best_predictions)

    plt.figure(figsize=(10, 7))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
    plt.title('Confusion Matrix')
    plt.ylabel('Actual Label')
    plt.xlabel('Predicted Label')
    plt.show()

    return best_multi_class_val

In [None]:
# Run the multi_class comparison with the other hyperparameters fixed to the pinned values.
best_multiclass_val = test_multi_class(X_train_tf, X_valid_tf, Y_train, Y_val, multi_class_ls, best_iter, best_solver, best_penalty , best_C)
print(best_multiclass_val)

In [None]:
# Pinned by hand: this overwrites whatever the search in the previous cell returned.
best_multiclass_val = 'multinomial'

<h2>Now I will combine the train set and the validation set and perform k-fold cross-validation</h2>

In [None]:
def plot_validation_scores(average_f1, average_recall, average_precision):
    """Bar chart of the three averaged validation metrics, each bar annotated
    with its value rounded to two decimals; the y-axis is fixed to [0, 1]."""
    labels = ['F1 Score', 'Recall', 'Precision']
    values = [average_f1, average_recall, average_precision]

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.bar(labels, values, color=['blue', 'green', 'red'])
    ax.set_title('Validation Scores')
    ax.set_ylim([0, 1])
    ax.set_ylabel('Scores')

    # Write each score just above its bar.
    for position, score in enumerate(values):
        ax.text(position, score + 0.02, f"{score:.2f}", ha='center', va='bottom')

    plt.tight_layout()
    plt.show()





In [None]:
# Stack the TF-IDF training and validation rows (converted to dense arrays)
# and their labels into one dataset for cross-validation.
X_combined = np.concatenate([X_train_tf.toarray(), X_valid_tf.toarray()], axis=0)
y_combined = np.concatenate([Y_train, Y_val])

In [None]:

# Stratified 5-fold cross-validation over the combined train + validation data.
kfold = StratifiedKFold(n_splits=5)

# Per-fold macro scores, kept separately for the fitted part and the held-out part.
list_f1_scores_train, list_recall_scores_train, list_precision_scores_train = [], [], []
list_f1_scores_val, list_recall_scores_val, list_precision_scores_val = [], [], []

for train_index, val_index in kfold.split(X_combined, y_combined):
    X_train_fold, y_train_fold = X_combined[train_index], y_combined[train_index]
    X_val_fold, y_val_fold = X_combined[val_index], y_combined[val_index]

    # A fresh model per fold, configured with the hyperparameters pinned above.
    model = LogisticRegression(
        penalty=best_penalty,
        C=best_C,
        solver=best_solver,
        max_iter=best_iter,
        multi_class=best_multiclass_val,
    )
    model.fit(X_train_fold, y_train_fold)

    # Scores on the data the fold's model was fitted on.
    predictions_train = model.predict(X_train_fold)
    list_f1_scores_train.append(f1_score(y_train_fold, predictions_train, average='macro'))
    list_recall_scores_train.append(recall_score(y_train_fold, predictions_train, average='macro'))
    list_precision_scores_train.append(
        precision_score(y_train_fold, predictions_train, average='macro', zero_division=0)
    )

    # Scores on the held-out part of the fold.
    predictions_val = model.predict(X_val_fold)
    list_f1_scores_val.append(f1_score(y_val_fold, predictions_val, average='macro'))
    list_recall_scores_val.append(recall_score(y_val_fold, predictions_val, average='macro'))
    list_precision_scores_val.append(
        precision_score(y_val_fold, predictions_val, average='macro', zero_division=0)
    )

# Average every metric over the five folds.
average_f1_train, average_recall_train, average_precision_train = (
    np.mean(fold_scores)
    for fold_scores in (list_f1_scores_train, list_recall_scores_train, list_precision_scores_train)
)
average_f1_val, average_recall_val, average_precision_val = (
    np.mean(fold_scores)
    for fold_scores in (list_f1_scores_val, list_recall_scores_val, list_precision_scores_val)
)






In [None]:
# Report the fold-averaged F1 for both parts, then chart the held-out metrics.
print(f'F1 train: {average_f1_train} F1 val: {average_f1_val}')
plot_validation_scores(average_f1_val, average_recall_val, average_precision_val)

<h2>Experiments with the CountVectorizer</h2>
<p>Since we don't achieve a high F1 score with the previous experiments, we will try the CountVectorizer to see what scores we achieve</p>

In [None]:
# Raw term counts (vocabulary capped at 4000 terms), fitted on the training text only.
count_vec = CountVectorizer(max_features=4000)
X_train_counts = count_vec.fit_transform(df_train_set['Text'])
X_test_counts, X_valid_counts = (
    count_vec.transform(split_df['Text']) for split_df in (df_test_set, df_valid_set)
)

In [None]:

# Stack the count-based training and validation rows (as dense arrays) and their labels.
# This rebinds X_combined / y_combined, which previously held the TF-IDF data.
X_combined = np.concatenate([X_train_counts.toarray(), X_valid_counts.toarray()], axis=0)
y_combined = np.concatenate([Y_train, Y_val])

In [None]:
# Unfitted model configured with the hyperparameters pinned in the cells above.
best_model = LogisticRegression(multi_class=best_multiclass_val, C=best_C, solver=best_solver, penalty=best_penalty, max_iter=best_iter)

In [None]:
# Seeded 67% / 33% hold-out split of the combined count features.
X_train, X_test, y_train, y_test = train_test_split(X_combined, y_combined, test_size=0.33, random_state=42)

In [None]:
# Fit and score `best_model` (created two cells above) on the hold-out split.
# Using `best_model` rather than `model`, a variable left over from the k-fold
# loop, means this cell no longer depends on that loop having been run.
best_model.fit(X_train, y_train)
predictions = best_model.predict(X_test)

# Macro-averaged scores on the hold-out part; zero_division=0 scores a class with
# no predicted samples as 0 instead of emitting a warning.
f1 = f1_score(y_test, predictions, average='macro')
recall = recall_score(y_test, predictions, average='macro')
precision = precision_score(y_test, predictions, average='macro', zero_division=0)

In [None]:
# Hold-out scores of the CountVectorizer experiment.
print(f'f1: {f1} Recall: {recall} Precision: {precision}')

<h1>Make predictions</h1>

In [None]:
# Switch back to the TF-IDF features: stack training and validation rows (as dense
# arrays) with their labels, and create a fresh model with the pinned hyperparameters.
X_combined = np.concatenate([X_train_tf.toarray(), X_valid_tf.toarray()], axis=0)
y_combined = np.concatenate([Y_train, Y_val])
best_model = LogisticRegression(
    penalty=best_penalty,
    C=best_C,
    solver=best_solver,
    max_iter=best_iter,
    multi_class=best_multiclass_val,
)

In [None]:
# Train the final model on the combined train + validation data.
best_model.fit(X_combined, y_combined)

In [None]:
# Predict the encoded sentiment of every test tweet from its TF-IDF features.
predictions = best_model.predict(X_test_tf)
predictions

In [None]:
# Map the integer predictions back to the original sentiment labels.
categorical_predictions = le.inverse_transform(predictions)

# Two-column submission table: the test-set id and its predicted label.
results = (
    df_test_set[['New_ID']]
    .rename(columns={'New_ID': 'Id'})
    .assign(Predicted=categorical_predictions)
)

results.to_csv('submission.csv', index=False)
results