In [None]:
conda list --export --explicit > requirements.txt

In [None]:
# Importing necessary packages

import numpy as np
import pandas as pd
from pandas import DataFrame as df
import seaborn as sns
import matplotlib.pyplot as plt

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.metrics import plot_confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier #A variant regression for classification tasks!
from sklearn.naive_bayes import GaussianNB as NaiveBayes
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.metrics import log_loss
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.utils import class_weight
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

import re
import string
import pickle
from string import punctuation
from scipy.stats import chi2_contingency
from wordcloud import WordCloud

from tensorflow.keras.models import Sequential, save_model, load_model
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


from joblib import dump, load

In [None]:
# Task 1: Loading Dataset
data = pd.read_csv('21204829.csv')

In [None]:
data.info()

In [None]:
data.head()

In [None]:
data.shape

In [None]:
data.columns

In [None]:
# Finding Null values in each column
data.isna().sum()

In [None]:
# Dropping rows with missing values
data.dropna(inplace=True)

In [None]:
# Dropping the unnamed column as it holds no significance in analysing the data
data = data.drop(["Unnamed: 0"], axis=1)

In [None]:
data.columns

In [None]:
# top 2 Categories of news in our datset
# https://www.kaggle.com/code/shivamburnwal/news-articles-classification
data.category.value_counts()[:2]

In [None]:
# https://www.kaggle.com/code/shivamburnwal/news-articles-classification
# our unique labels of text which are to be classified.
# Check the unique values in the 'category' column
unique_categories = data['category'].dropna().unique()
print("Unique Categories:")
print(unique_categories)

In [None]:
# Creating a contingency table to find the p-values of 'category' - 'headline' and 'category' - 'authors'
contingency_table1 = pd.crosstab(data['category'], data['headline'])
contingency_table2 = pd.crosstab(data['category'], data['authors'])

# Performing chi-square test of independence for 'category' and 'headline'
chi2, p_value, _, _ = chi2_contingency(contingency_table1)

# Printing the chi-square statistic and p-value
print("Chi-square statistic for 'category' and 'headline':", chi2)
print("p-value for Contingency Table 1:", p_value)

# Performing chi-square test of independence for 'category' and 'author'
chi2, p_value, _, _ = chi2_contingency(contingency_table2)

# Printing the chi-square statistic and p-value
print("Chi-square statistic for 'category' and 'author':", chi2)
print("p-value for Contingency Table 2:", p_value)

# Exploring author-specific insights
top_authors = data['authors'].value_counts().head(10)

for author in top_authors.index:
    author_categories = data[data['authors'] == author]['category'].unique()
    print(f"Author: {author}, Categories: {author_categories}")
    

Based on the analysis, there is no strong evidence to conclude that the category of an article and its headline 
are associated or dependent on each other. The p-value of 0.4695 indicates that there is a 46.95% probability of 
observing such a large chi-square statistic under the assumption that the category and headline are independent.

The significant result suggests that the category of the article and author are not independent and that changes 
or patterns in one variable are associated with changes or patterns in the other variable.

The category and headline variables are compared using the chi-square test of independence to determine whether there is a statistically significant correlation. If the two variables are truly independent, the p-value represents the likelihood of getting the observed outcomes (or more severe ones).
We do not have enough data to reject the null hypothesis that there is no correlation between the 'category' and 'headline' variables because the p-value of 0.4760503855701773 is bigger than the commonly accepted significance level of 0.05. In this dataset, the "category" and "headline" so appear to be independent.

In [None]:
# Function to get the most common terms for each category
categories = data['category'].unique()
for category in categories:
    # Filtering the dataframe by category
    category_df = data[data['category'] == category]
    
    # Creating a count vectorizer
    vectorizer = CountVectorizer()
    
    # Fit and transform the preprocessed text
    count_matrix = vectorizer.fit_transform(category_df['short_description'])
    
    # Getting the feature names (terms)
    terms = vectorizer.get_feature_names()
    
    # Calculating the term frequencies
    term_frequencies = count_matrix.sum(axis=0)
    
    # Sorting the terms by frequency
    sorted_terms = sorted(zip(terms, term_frequencies.tolist()[0]), key=lambda x: x[1], reverse=True)
    
    # Printing the most common terms for the category
    print(f"Most common terms for category before preprocessing: {category}")
    for term, frequency in sorted_terms[:10]:
        print(f"{term}: {frequency}")
    print()


# Preprocessing
STOPWORDS = set(stopwords.words('english'))
punctuation = list(string.punctuation)
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    if isinstance(text, str):
        # Converting text to lowercase
        text = text.lower()

        # Removing punctuation
        text = ''.join([c for c in text if c not in punctuation])

        # Tokenizing text
        tokens = text.split()

        # Removing stopwords and perform lemmatization
        tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in STOPWORDS]

        # Joining tokens back to text
        text = ' '.join(tokens)
    else:
        text = ""
    
    return text

# Applying preprocessing to the 'short_description' column
data['preprocessed_text'] = data['short_description'].apply(preprocess_text)

# Getting the most common terms for each category
categories = data['category'].unique()
for category in categories:
    # Filtering the dataframe by category
    category_df = data[data['category'] == category]
    
    # Creating a count vectorizer
    vectorizer = CountVectorizer()
    
    # Fit and transform the preprocessed text
    count_matrix = vectorizer.fit_transform(category_df['preprocessed_text'])
    
    # Getting the feature names (terms)
    terms = vectorizer.get_feature_names()
    
    # Calculating the term frequencies
    term_frequencies = count_matrix.sum(axis=0)
    
    # Sorting the terms by frequency
    sorted_terms = sorted(zip(terms, term_frequencies.tolist()[0]), key=lambda x: x[1], reverse=True)
    
    # Printing the most common terms for the category
    print(f"Most common terms for category after preprocessing: {category}")
    for term, frequency in sorted_terms[:10]:
        print(f"{term}: {frequency}")
    print()


In [None]:
# Replacing 'df' with the actual name of your DataFrame
preprocessed_headlines = data['headline'].apply(preprocess_text)

# Creating the document-word matrix
vectorizer = CountVectorizer(max_features=1000)  # Adjust the max_features parameter as needed
document_word_matrix = vectorizer.fit_transform(preprocessed_headlines)

# Applying Latent Dirichlet Allocation (LDA)
num_topics = 5  # Specify the number of topics
lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
lda.fit(document_word_matrix)

# Getting the top words for each topic
feature_names = vectorizer.get_feature_names()
top_words_per_topic = []
for topic_idx, topic in enumerate(lda.components_):
    top_words = [feature_names[i] for i in topic.argsort()[:-11:-1]]
    top_words_per_topic.append(top_words)

# Visualizing the topics using word clouds
plt.figure(figsize=(12, 8))
for topic_idx, top_words in enumerate(top_words_per_topic):
    wordcloud = WordCloud(background_color='white').generate(' '.join(top_words))
    plt.subplot(2, 3, topic_idx+1)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.title('Topic ' + str(topic_idx+1))
    plt.axis('off')
plt.tight_layout()
plt.show()


In [None]:
# 'link' and 'date' columns are removed
df = data
del df['link']
del df['date']
df.columns

In [None]:
# Calculating sentence length for each row
data['sentence_length'] = data['preprocessed_text'].str.split().apply(lambda x: len(x))

# Grouping the data by category and calculate mean and standard deviation of sentence length
category_stats = data.groupby('category')['sentence_length'].agg(['mean', 'std'])

# Print the statistics for each category
for category, stats in category_stats.iterrows():
    print(f"Category: {category}")
    print(f"Mean sentence length: {stats['mean']:.2f}")
    print(f"Standard deviation of sentence length: {stats['std']:.2f}")
    print()

In [None]:
# Checking for outliers in sentence length
q1 = data['sentence_length'].quantile(0.25)
q3 = data['sentence_length'].quantile(0.75)
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
outliers = data[(data['sentence_length'] < lower_bound) | (data['sentence_length'] > upper_bound)]
print("Outliers in Sentence Length:")
print(outliers)

In [None]:
# Plotting boxplot for each category
plt.figure(figsize=(12, 6))
sns.boxplot(x='category', y='sentence_length', data=df)
plt.title('Outliers in Sentence Length by Category')
plt.xlabel('Category')
plt.ylabel('Sentence Length')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Removing data based on sentence length might bias the dataset and affect the analysis of the relationship between 
# sentence length and the target variable (category).
# Data Distribution: Analysing the distribution of sentence lengths across categories to 
# assess if there are any significant differences
# Calculate sentence lengths for each category
category_lengths = {}
for category in df['category'].unique():
    sentences = df[df['category'] == category]['preprocessed_text']
    sentence_lengths = [len(sentence.split()) for sentence in sentences]
    category_lengths[category] = sentence_lengths

# Plot the distribution of sentence lengths for each category
plt.figure(figsize=(10, 6))
for category, lengths in category_lengths.items():
    plt.hist(lengths, bins=20, alpha=0.5, label=category)
plt.xlabel('Sentence Length')
plt.ylabel('Count')
plt.title('Distribution of Sentence Lengths by Category')
plt.legend()
plt.show()

In [None]:
# Data Balancing
# Calculating the initial distribution of categories
initial_distribution = df['category'].value_counts()

# Removing data based on sentence length
max_sentence_length = np.mean(df['sentence_length'])  # Define the maximum sentence length threshold
df_filtered = df[df['sentence_length'] <= max_sentence_length]

# Calculating the updated distribution of categories
updated_distribution = df_filtered['category'].value_counts()

# Visualizing the initial and updated distributions
plt.figure(figsize=(12, 6))
plt.subplot(121)
initial_distribution.plot(kind='bar', color='blue')
plt.title('Initial Distribution')
plt.xlabel('Category')
plt.ylabel('Count')

plt.subplot(122)
updated_distribution.plot(kind='bar', color='green')
plt.title('Updated Distribution')
plt.xlabel('Category')
plt.ylabel('Count')

plt.tight_layout()
plt.show()

# Comparing the distributions and assess the impact
print("Initial Distribution:\n", initial_distribution)
print("\nUpdated Distribution:\n", updated_distribution)

# Calculating sentence lengths for each category
category_lengths = {}
for category in df_filtered['category'].unique():
    sentences = df_filtered[df_filtered['category'] == category]['preprocessed_text']
    sentence_lengths = [len(sentence.split()) for sentence in sentences]
    category_lengths[category] = sentence_lengths

# Plotting the distribution of sentence lengths for each category
plt.figure(figsize=(10, 6))
for category, lengths in category_lengths.items():
    plt.hist(lengths, bins=20, alpha=0.5, label=category)
plt.xlabel('Sentence Length')
plt.ylabel('Count')
plt.title('Distribution of Sentence Lengths by Category')
plt.legend()
plt.show()

In [None]:
# Removing unnecessary Columns from the data frame 'df_filtered'
del df_filtered['short_description']
del df_filtered['sentence_length']
df.columns

In [None]:
# Assigning values to X and y
X = df_filtered.preprocessed_text
y = df_filtered.category

# Splitting the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Splitting the train set further into train and valid sets
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=0)

# Creating DataFrames for the train, valid, and test sets
train_data = pd.DataFrame({'text': X_train, 'label': y_train})
valid_data = pd.DataFrame({'text': X_valid, 'label': y_valid})
test_data = pd.DataFrame({'text': X_test, 'label': y_test})

# Save the train, valid, and test data to separate CSV files
train_data.to_csv('train.csv', index=False)
valid_data.to_csv('valid.csv', index=False)
test_data.to_csv('test.csv', index=False)

# Printing the shapes of the train, valid, and test sets
print(f"Train set shape: {train_data.shape}")
print(f"Valid set shape: {valid_data.shape}")
print(f"Test set shape: {test_data.shape}")


In [None]:
# Checking if each columns have null values
df_filtered.isna().sum()

In [None]:
# Filling missing values with empty strings
train_data['text'] = train_data['text'].fillna('')
valid_data['text'] = valid_data['text'].fillna('')

# Converting text into a numeric representation
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_data['text'])
X_valid = vectorizer.transform(valid_data['text'])

# Getting the labels
y_train = train_data['label']
y_valid = valid_data['label']

train_data.head()

In [None]:
# Defining the models
models = [('Logistic Regression', LogisticRegression(max_iter=500)),
          ('Random Forest', RandomForestClassifier()),
          ('Linear SVC', LinearSVC()),
          ('Multinomial NaiveBayes', MultinomialNB()),
          ('SGD Classifier', SGDClassifier())]

names = []
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []

# Iterate over each model
for name, clf in models:
    # Create a pipeline
    pipe = Pipeline([
        ('clf', clf),
    ])
    
    # Fit the pipeline on the training data
    pipe.fit(X_train, y_train)
    
    # Predict on the validation data
    y_pred = pipe.predict(X_valid)
    
    # Calculate evaluation metrics
    accuracy = accuracy_score(y_valid, y_pred)
    precision = precision_score(y_valid, y_pred, pos_label='QUEER VOICES')
    recall = recall_score(y_valid, y_pred, pos_label='QUEER VOICES')
    f1 = f1_score(y_valid, y_pred, pos_label='QUEER VOICES')  # Calculate the F1 score
    
    # Store the model and scores
    names.append(name)
    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)
    
    # Save the model
    filename = f'{name}.joblib'
    dump(pipe, filename)
    
    # Calculate the confusion matrix
    plot_confusion_matrix(pipe, X_valid, y_valid, cmap=plt.cm.Blues)
    plt.title(f"Confusion Matrix - {name}")
    plt.show()

# Create a DataFrame with the scores
scores_df = pd.DataFrame({
    'Model': names,
    'Accuracy': accuracy_scores,
    'Precision': precision_scores,
    'Recall': recall_scores,
    'F1 Score': f1_scores
})

# Print the DataFrame
print(scores_df)



In [None]:
# https://www.learndatasci.com/glossary/binary-classification/
# Plot showing Accuracy, Precision and Recall scores of all the 5 binary classifiers
ax = scores_df.plot.barh(x='Model', y=['Accuracy', 'Precision', 'Recall'], stacked=True)
ax.legend(ncol=len(models), bbox_to_anchor=(0, 1), loc='lower left', prop={'size': 14})
plt.xlabel('Score')
plt.ylabel('Model')
plt.title('Model Performance')
plt.tight_layout()
plt.show()

In [None]:
# Supervised CNN Model and evaluating the accuracy and F1-score
# Loading the train and validation datasets
train_data = pd.read_csv("train.csv")
valid_data = pd.read_csv("valid.csv")

# Dropping rows with missing values
train_data = train_data.dropna()
valid_data = valid_data.dropna()

# Extracting the text and label columns
train_text = train_data["text"].tolist()
train_labels = train_data["label"].tolist()
valid_text = valid_data["text"].tolist()
valid_labels = valid_data["label"].tolist()

# Encoding labels
label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(train_labels)
valid_labels = label_encoder.transform(valid_labels)

# Tokenizing the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_text)
train_sequences = tokenizer.texts_to_sequences(train_text)
valid_sequences = tokenizer.texts_to_sequences(valid_text)

# Padding sequences to have the same length
max_sequence_length = max(len(seq) for seq in train_sequences)
train_sequences = pad_sequences(train_sequences, maxlen=max_sequence_length)
valid_sequences = pad_sequences(valid_sequences, maxlen=max_sequence_length)

# Creating the CNN model
model = Sequential()
model.add(Embedding(len(tokenizer.word_index) + 1, 100, input_length=max_sequence_length))
model.add(Conv1D(128, 5, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(len(label_encoder.classes_), activation='softmax'))

# Compiling the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Training the model
model.fit(train_sequences, train_labels, batch_size=32, epochs=10, validation_data=(valid_sequences, valid_labels))

# Saving the model
model.save("cnn_model.h5")

# Evaluating the model
valid_pred = model.predict(valid_sequences)
valid_pred_labels = np.argmax(valid_pred, axis=1)

accuracy = accuracy_score(valid_labels, valid_pred_labels)
precision = precision_score(valid_labels, valid_pred_labels, average='weighted')
recall = recall_score(valid_labels, valid_pred_labels, average='weighted')
f1 = f1_score(valid_labels, valid_pred_labels, average='weighted')

# Printing the evaluation metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")


In [None]:
# Using the saved CNN model to evaluate its performance on train and valid datasets

# Loading the saved model
loaded_model = load_model("cnn_model.h5")

# Evaluating the loaded model on train data
train_pred = loaded_model.predict(train_sequences)
train_pred_labels = np.argmax(train_pred, axis=1)

train_accuracy = accuracy_score(train_labels, train_pred_labels)
train_precision = precision_score(train_labels, train_pred_labels, average='weighted')
train_recall = recall_score(train_labels, train_pred_labels, average='weighted')
train_f1 = f1_score(train_labels, train_pred_labels, average='weighted')

# Evaluating the loaded model on valid data
valid_pred = loaded_model.predict(valid_sequences)
valid_pred_labels = np.argmax(valid_pred, axis=1)

valid_accuracy = accuracy_score(valid_labels, valid_pred_labels)
valid_precision = precision_score(valid_labels, valid_pred_labels, average='weighted')
valid_recall = recall_score(valid_labels, valid_pred_labels, average='weighted')
valid_f1 = f1_score(valid_labels, valid_pred_labels, average='weighted')

# Printing the evaluation metrics for train and valid data
print("Train Data Metrics:")
print(f"Accuracy: {train_accuracy:.4f}")
print(f"Precision: {train_precision:.4f}")
print(f"Recall: {train_recall:.4f}")
print(f"F1 Score: {train_f1:.4f}")
print()
print("Valid Data Metrics:")
print(f"Accuracy: {valid_accuracy:.4f}")
print(f"Precision: {valid_precision:.4f}")
print(f"Recall: {valid_recall:.4f}")
print(f"F1 Score: {valid_f1:.4f}")


In [None]:
# Evaluating the binary classifiers for accuracy and F1-scores on train and valitaion data
# Fit the vectorizer on the entire train data
vectorizer = TfidfVectorizer()
train_vectors = vectorizer.fit_transform(train_data['text'])

# Transform the train and validation text data using the vectorizer
valid_vectors = vectorizer.transform(valid_data['text'])

# Convert the sparse matrix to a dense matrix
valid_vectors = valid_vectors.toarray()

# Get the feature names from the vectorizer
feature_names = vectorizer.get_feature_names()

# Check for any extra features in the validation data
extra_features = set(feature_names) - set(vectorizer.get_feature_names())

if extra_features:
    # Filter out the extra features from the validation vectors
    valid_vectors = valid_vectors[:, [i for i, feature_name in enumerate(feature_names) if feature_name not in extra_features]]

# Perform padding on the input vectors
max_sequence_length = train_vectors.shape[1]
valid_vectors = pad_sequences(valid_vectors, maxlen=max_sequence_length)    
# train_vectors = pad_sequences(train_vectors, maxlen=max_sequence_length)    


model_names = ['Logistic Regression', 'Random Forest', 'Linear SVC', 'Multinomial NaiveBayes', 'SGD Classifier']
file_names = ['Logistic Regression.joblib', 'Random Forest.joblib', 'Linear SVC.joblib', 'Multinomial NaiveBayes.joblib', 'SGD Classifier.joblib']
    
# Load and evaluate each saved model
for model_name, file_name in zip(model_names, file_names):
    # Load the model
    if file_name.endswith('.joblib'):
        model = load(file_name)
    elif file_name.endswith('.h5'):
        model = load_model(file_name)
    else:
        raise ValueError(f"Unsupported file format: {file_name}")

    # Make predictions on the train and validation datasets
    train_pred = model.predict(train_vectors)
    valid_pred = model.predict(valid_vectors)

    # Calculate evaluation metrics for the train dataset
    accuracy_train = accuracy_score(train_data['label'], train_pred)
    precision_train = precision_score(train_data['label'], train_pred, pos_label='QUEER VOICES')
    recall_train = recall_score(train_data['label'], train_pred, pos_label='QUEER VOICES')
    f1_train = f1_score(train_data['label'], train_pred, pos_label='QUEER VOICES')

    # Calculate evaluation metrics for the validation dataset
    accuracy_valid = accuracy_score(valid_data['label'], valid_pred)
    precision_valid = precision_score(valid_data['label'], valid_pred, pos_label='QUEER VOICES')
    recall_valid = recall_score(valid_data['label'], valid_pred, pos_label='QUEER VOICES')
    f1_valid = f1_score(valid_data['label'], valid_pred, pos_label='QUEER VOICES')

    # Print the evaluation metrics for the train and validation datasets
    print(f"Train Metrics for {model_name}:")
    print(f"Accuracy: {accuracy_train:.4f}")
    print(f"Precision: {precision_train:.4f}")
    print(f"Recall: {recall_train:.4f}")
    print(f"F1 Score: {f1_train:.4f}")
    print()
    print(f"Validation Metrics for {model_name}:")
    print(f"Accuracy: {accuracy_valid:.4f}")
    print(f"Precision: {precision_valid:.4f}")
    print(f"Recall: {recall_valid:.4f}")
    print(f"F1 Score: {f1_valid:.4f}")
    print()


In [None]:
# Loading the saved CNN model and evaluating for its performance (acuuracy and F1-score) on train and validation data
# Load the saved model
loaded_model = load_model("cnn_model.h5")

# Evaluate the loaded model on train data
train_pred = loaded_model.predict(train_sequences)
train_pred_labels = np.argmax(train_pred, axis=1)

train_accuracy = accuracy_score(train_labels, train_pred_labels)
train_precision = precision_score(train_labels, train_pred_labels, average='weighted')
train_recall = recall_score(train_labels, train_pred_labels, average='weighted')
train_f1 = f1_score(train_labels, train_pred_labels, average='weighted')

# Evaluate the loaded model on valid data
valid_pred = loaded_model.predict(valid_sequences)
valid_pred_labels = np.argmax(valid_pred, axis=1)

valid_accuracy = accuracy_score(valid_labels, valid_pred_labels)
valid_precision = precision_score(valid_labels, valid_pred_labels, average='weighted')
valid_recall = recall_score(valid_labels, valid_pred_labels, average='weighted')
valid_f1 = f1_score(valid_labels, valid_pred_labels, average='weighted')

# Print the evaluation metrics for train and valid data
print("Train Data Metrics for CNN:")
print(f"Accuracy: {train_accuracy:.4f}")
print(f"Precision: {train_precision:.4f}")
print(f"Recall: {train_recall:.4f}")
print(f"F1 Score: {train_f1:.4f}")
print()
print("Valid Data Metrics for CNN:")
print(f"Accuracy: {valid_accuracy:.4f}")
print(f"Precision: {valid_precision:.4f}")
print(f"Recall: {valid_recall:.4f}")
print(f"F1 Score: {valid_f1:.4f}")



Looking at the performance metrics on the train and validation sets, we can observe the following:

Logistic Regression:

Train Accuracy: 0.9576, Validation Accuracy: 0.6381
Train F1 Score: 0.9694, Validation F1 Score: 0.7751
The accuracy, precision, recall, and F1 score on the train set are higher compared to the validation set. This indicates that the model is slightly overfitting the train data, as the performance drops slightly on the validation set. However, the drop in performance is relatively small, suggesting that the model generalizes well.

Random Forest:

Train Accuracy: 0.7989, Validation Accuracy: 0.6349
Train F1 Score: 0.8699, Validation F1 Score: 0.7754
Similar to Logistic Regression, the Random Forest model shows higher accuracy, precision, recall, and F1 score on the train set compared to the validation set. This indicates a slight overfitting of the train data, but the drop in performance on the validation set is not significant.

Linear SVC:

Train Accuracy: 0.9939, Validation Accuracy: 0.6397
Train F1 Score: 0.9955, Validation F1 Score: 0.7754
The Linear SVC model demonstrates very high performance on both the train and validation sets. The metrics on the train set are nearly identical to those on the validation set, indicating that the model generalizes well and does not suffer from overfitting. It achieves high accuracy, precision, recall, and F1 score on both datasets.

Multinomial NaiveBayes:

Train Accuracy: 0.9540, Validation Accuracy: 0.6365
Train F1 Score: 0.9666, Validation F1 Score: 0.7739
Similar to the previous models, Multinomial NaiveBayes exhibits slightly higher performance on the train set compared to the validation set. The drop in performance on the validation set is relatively small, indicating reasonable generalization capabilities. The model achieves good accuracy, precision, recall, and F1 score on both datasets.

SGD Classifier:

Train Accuracy: 0.9851, Validation Accuracy: 0.6414
Train F1 Score: 0.9890, Validation F1 Score: 0.7767
The SGD Classifier model performs exceptionally well on both the train and validation sets. It achieves high accuracy, precision, recall, and F1 score on both datasets. The metrics on the train set are almost identical to those on the validation set, suggesting excellent generalization.

CNN:

Train Accuracy: 0.9988, Validation Accuracy: 0.7964
Train F1 Score: 0.9988, Validation F1 Score: 0.7944
The CNN model exhibits a slight decrease in performance when transitioning from the train set to the validation set. It achieves a train accuracy of 0.9988 and a validation accuracy of 0.7964, as well as a train F1 score of 0.9988 and a validation F1 score of 0.7944. Although the model maintains a high level of accuracy and F1 score on the train set, it encounters difficulties in generalizing to unseen data, resulting in slightly lower performance on the validation set. This suggests that the model might be overfitting the train data, emphasizing specific patterns and noise that are not representative of the overall dataset. Adjustments such as regularization techniques or fine-tuning the model architecture could help improve its ability to generalize and enhance performance on unseen data.

In summary, the Logistic Regression, Linear SVC, Multinomial NaiveBayes, SGD Classifier models perform well and show good generalization, with similar performance on both the training and validation sets. Random Forest also performs reasonably well but exhibits a slight drop in performance on the validation set. However, the CNN model exhibits a slight decrease in performance when transitioning from the train set to the validation set. It maintains high accuracy and F1 score on the train set but encounters difficulties in generalizing to unseen data, resulting in slightly lower performance on the validation set. This indicates a potential overfitting of the train data and suggests the need for adjustments to improve generalization.

Overall, the Linear SVC and SGD Classifier models showcase the best generalization capabilities, while the CNN model requires further fine-tuning to enhance its ability to generalize and perform well on unseen data.

In [None]:
# Error analysis of the binary classifiers and finding the misclassified sentences in train and valid data
# Fit the vectorizer on the entire train data
vectorizer = TfidfVectorizer()
train_vectors = vectorizer.fit_transform(train_data['text'])

# Transform the train and validation text data using the vectorizer
valid_vectors = vectorizer.transform(valid_data['text'])

# Converting the sparse matrix to a dense matrix
valid_vectors = valid_vectors.toarray()

# Getting the feature names from the vectorizer
feature_names = vectorizer.get_feature_names()

# Checking for any extra features in the validation data
extra_features = set(feature_names) - set(vectorizer.get_feature_names())

if extra_features:
    # Filtering out the extra features from the validation vectors
    valid_vectors = valid_vectors[:, [i for i, feature_name in enumerate(feature_names) if feature_name not in extra_features]]

# Performing padding on the input vectors
max_sequence_length = train_vectors.shape[1]
valid_vectors = pad_sequences(valid_vectors, maxlen=max_sequence_length)

model_names = ['Logistic Regression', 'Random Forest', 'Linear SVC', 'Multinomial NaiveBayes', 'SGD Classifier']
file_names = ['Logistic Regression.joblib', 'Random Forest.joblib', 'Linear SVC.joblib', 'Multinomial NaiveBayes.joblib', 'SGD Classifier.joblib']

# Loading and evaluating each saved model
for model_name, file_name in zip(model_names, file_names):
    # Load the model
    if file_name.endswith('.joblib'):
        model = load(file_name)
    elif file_name.endswith('.h5'):
        model = load_model(file_name)
    else:
        raise ValueError(f"Unsupported file format: {file_name}")

    # Making predictions on the train and validation datasets
    train_pred = model.predict(train_vectors)
    valid_pred = model.predict(valid_vectors)

    # Performing error analysis
    train_errors = train_data[train_data['label'] != train_pred]
    valid_errors = valid_data[valid_data['label'] != valid_pred]

    print(f"Error Analysis for {model_name}:")
    print("Train Errors:")
    print(train_errors)
    print()

    print("Validation Errors:")
    print(valid_errors)
    print()

    print("-------------------------")


The error analysis conducted on Linear Regression, Random Forest, Linear SVC, Naive Bayes and SGD Classifier models reveals the misclassified examples for each model and provides insights into the patterns of misclassification. Let's examine if the different models classified the same sentences incorrectly and summarize the findings:

From the error analysis, it appears that the different models classified some sentences incorrectly in common. Here are the observations:

The sentences misclassified by multiple models are primarily from the "TASTE" label. Some examples of these common misclassifications include sentences like "get ready," "weirdest debate 2017," and "okay lady let's get formation." These sentences seem to have ambiguous or context-dependent meanings, making them challenging to classify correctly.

The misclassifications are more prevalent in the "TASTE" label, indicating that the models struggle to distinguish between different topics related to food, recipes, and flavors.

The models tend to misclassify shorter or less informative sentences, as seen in examples like "bottom," "huge," or "look." These sentences lack specific context or clear indicators for classification, leading to errors.

It is interesting to note that the misclassified examples have varying degrees of severity. Some sentences could be considered subjective or open to interpretation, making it challenging even for human annotators to assign a definitive label.

Overall, the analysis suggests that the models face difficulty in accurately classifying certain sentences related to food, flavors, and subjective topics. Improving the models' performance in distinguishing such nuanced and context-dependent sentences would require additional training data and more fine-tuning of the models' parameters.

It is important to continue analyzing and refining the models to address these common misclassifications and enhance their overall accuracy and robustness.

In [None]:
# Error analysis of the CNN Model and finding the misclassified sentences in train and valid data
# Load the saved model
loaded_model = load_model("cnn_model.h5")

# Get the predicted labels for train and valid datasets
train_pred_labels = np.argmax(loaded_model.predict(train_sequences), axis=1)
valid_pred_labels = np.argmax(loaded_model.predict(valid_sequences), axis=1)

# Get the original labels for train and valid datasets
train_original_labels = train_labels
valid_original_labels = valid_labels

# Create a DataFrame to store the results for train dataset
train_error_analysis_df = pd.DataFrame({"Text": train_text, "Original Label": train_original_labels, "Predicted Label": train_pred_labels})

# Create a DataFrame to store the results for valid dataset
valid_error_analysis_df = pd.DataFrame({"Text": valid_text, "Original Label": valid_original_labels, "Predicted Label": valid_pred_labels})

# Filter the DataFrames to get misclassified examples
train_misclassified_df = train_error_analysis_df[train_error_analysis_df["Original Label"] != train_error_analysis_df["Predicted Label"]]
valid_misclassified_df = valid_error_analysis_df[valid_error_analysis_df["Original Label"] != valid_error_analysis_df["Predicted Label"]]

# Print the misclassified examples for train dataset
print("Misclassified Examples in Train Dataset:")
print(train_misclassified_df)

# Print the misclassified examples for valid dataset
print("Misclassified Examples in Valid Dataset:")
print(valid_misclassified_df)


From the error analysis for the CNN model, it is evident that the misclassifications are primarily occurring in the "QUEER VOICES" label. However, it is important to note that the CNN model has misclassified a significant number of sentences compared to the other models. Here are the observations:

The CNN model misclassifies a large number of sentences from the "QUEER VOICES" label in both the train and validation sets. These sentences contain various topics related to LGBTQ+ issues, such as discussions on political figures, personal experiences, societal challenges, and the LGBTQ+ community.

Unlike the other models, the CNN model does not misclassify any sentences as "TASTE" in both the train and validation sets. This indicates that the CNN model's misclassifications are specific to the "QUEER VOICES" label and not related to food or flavors.

The confusion matrix for the CNN model shows that it misclassifies all the examples in the "QUEER VOICES" label, while correctly classifying all the examples in the "TASTE" label. This indicates a significant bias in the model's predictions towards the "TASTE" label.

In comparison to the other models, the CNN model performs poorly in terms of misclassifications, especially for the "QUEER VOICES" label. This suggests that the CNN model is not effectively capturing the patterns and nuances of LGBTQ+ content in the dataset. It may require further training, fine-tuning, or architectural adjustments to improve its performance and reduce bias.

Overall, the analysis highlights the challenges faced by the CNN model in accurately classifying sentences related to LGBTQ+ topics. Addressing these misclassifications would be crucial for improving the model's ability to capture the nuances and complexities of the "QUEER VOICES" label

In [None]:
# Making changes to the Binary Classifiers by changing its parameters and saving the updated models

# Fit the vectorizer on the entire train data
vectorizer = TfidfVectorizer()
train_vectors = vectorizer.fit_transform(train_data['text'])

# Transform the train and validation text data using the vectorizer
valid_vectors = vectorizer.transform(valid_data['text'])

# Convert the sparse matrix to a dense matrix
valid_vectors = valid_vectors.toarray()

# Get the feature names from the vectorizer
feature_names = vectorizer.get_feature_names()

# Check for any extra features in the validation data
extra_features = set(feature_names) - set(vectorizer.get_feature_names())

if extra_features:
    # Filter out the extra features from the validation vectors
    valid_vectors = valid_vectors[:, [i for i, feature_name in enumerate(feature_names) if feature_name not in extra_features]]

# Perform padding on the input vectors
max_sequence_length = train_vectors.shape[1]
valid_vectors = pad_sequences(valid_vectors, maxlen=max_sequence_length)

# Apply changes to each classifier/model and re-evaluate

# Logistic Regression
logreg_model = LogisticRegression(class_weight='balanced')
logreg_model.fit(train_vectors, train_data['label'])
logreg_pred_train = logreg_model.predict(train_vectors)
logreg_pred_valid = logreg_model.predict(valid_vectors)
logreg_accuracy_train = accuracy_score(train_data['label'], logreg_pred_train)
logreg_accuracy_valid = accuracy_score(valid_data['label'], logreg_pred_valid)
logreg_precision_train = precision_score(train_data['label'], logreg_pred_train, pos_label='QUEER VOICES')
logreg_precision_valid = precision_score(valid_data['label'], logreg_pred_valid, pos_label='QUEER VOICES')
logreg_recall_train = recall_score(train_data['label'], logreg_pred_train, pos_label='QUEER VOICES')
logreg_recall_valid = recall_score(valid_data['label'], logreg_pred_valid, pos_label='QUEER VOICES')
logreg_f1_train = f1_score(train_data['label'], logreg_pred_train, pos_label='QUEER VOICES')
logreg_f1_valid = f1_score(valid_data['label'], logreg_pred_valid, pos_label='QUEER VOICES')

# Random Forest
random_forest_model = RandomForestClassifier(n_estimators=100)
random_forest_model.fit(train_vectors, train_data['label'])
random_forest_pred_train = random_forest_model.predict(train_vectors)
random_forest_pred_valid = random_forest_model.predict(valid_vectors)
random_forest_accuracy_train = accuracy_score(train_data['label'], random_forest_pred_train)
random_forest_accuracy_valid = accuracy_score(valid_data['label'], random_forest_pred_valid)
random_forest_precision_train = precision_score(train_data['label'], random_forest_pred_train, pos_label='QUEER VOICES')
random_forest_precision_valid = precision_score(valid_data['label'], random_forest_pred_valid, pos_label='QUEER VOICES')
random_forest_recall_train = recall_score(train_data['label'], random_forest_pred_train, pos_label='QUEER VOICES')
random_forest_recall_valid = recall_score(valid_data['label'], random_forest_pred_valid, pos_label='QUEER VOICES')
random_forest_f1_train = f1_score(train_data['label'], random_forest_pred_train, pos_label='QUEER VOICES')
random_forest_f1_valid = f1_score(valid_data['label'], random_forest_pred_valid, pos_label='QUEER VOICES')

# Linear SVC
linear_svc_model = LinearSVC(class_weight='balanced')
linear_svc_model.fit(train_vectors, train_data['label'])
linear_svc_pred_train = linear_svc_model.predict(train_vectors)
linear_svc_pred_valid = linear_svc_model.predict(valid_vectors)
linear_svc_accuracy_train = accuracy_score(train_data['label'], linear_svc_pred_train)
linear_svc_accuracy_valid = accuracy_score(valid_data['label'], linear_svc_pred_valid)
linear_svc_precision_train = precision_score(train_data['label'], linear_svc_pred_train, pos_label='QUEER VOICES')
linear_svc_precision_valid = precision_score(valid_data['label'], linear_svc_pred_valid, pos_label='QUEER VOICES')
linear_svc_recall_train = recall_score(train_data['label'], linear_svc_pred_train, pos_label='QUEER VOICES')
linear_svc_recall_valid = recall_score(valid_data['label'], linear_svc_pred_valid, pos_label='QUEER VOICES')
linear_svc_f1_train = f1_score(train_data['label'], linear_svc_pred_train, pos_label='QUEER VOICES')
linear_svc_f1_valid = f1_score(valid_data['label'], linear_svc_pred_valid, pos_label='QUEER VOICES')

# SGD Classifier
sgd_classifier_model = SGDClassifier(class_weight='balanced')
sgd_classifier_model.fit(train_vectors, train_data['label'])
sgd_classifier_pred_train = sgd_classifier_model.predict(train_vectors)
sgd_classifier_pred_valid = sgd_classifier_model.predict(valid_vectors)
sgd_classifier_accuracy_train = accuracy_score(train_data['label'], sgd_classifier_pred_train)
sgd_classifier_accuracy_valid = accuracy_score(valid_data['label'], sgd_classifier_pred_valid)
sgd_classifier_precision_train = precision_score(train_data['label'], sgd_classifier_pred_train, pos_label='QUEER VOICES')
sgd_classifier_precision_valid = precision_score(valid_data['label'], sgd_classifier_pred_valid, pos_label='QUEER VOICES')
sgd_classifier_recall_train = recall_score(train_data['label'], sgd_classifier_pred_train, pos_label='QUEER VOICES')
sgd_classifier_recall_valid = recall_score(valid_data['label'], sgd_classifier_pred_valid, pos_label='QUEER VOICES')
sgd_classifier_f1_train = f1_score(train_data['label'], sgd_classifier_pred_train, pos_label='QUEER VOICES')
sgd_classifier_f1_valid = f1_score(valid_data['label'], sgd_classifier_pred_valid, pos_label='QUEER VOICES')

# Save the updated models
dump(logreg_model, 'LogisticRegression_updated.joblib')
dump(random_forest_model, 'RandomForest_updated.joblib')
dump(linear_svc_model, 'LinearSVC_updated.joblib')
dump(sgd_classifier_model, 'SGDClassifier_updated.joblib')

In [None]:
# Performance evaluation of all the saved updated binary classifiers on train and valid datasets

# Fit the vectorizer on the entire train data
vectorizer = TfidfVectorizer()
train_vectors = vectorizer.fit_transform(train_data['text'])

# Transform the train and validation text data using the vectorizer
valid_vectors = vectorizer.transform(valid_data['text'])

# Convert the sparse matrix to a dense matrix
valid_vectors = valid_vectors.toarray()

# Get the feature names from the vectorizer
feature_names = vectorizer.get_feature_names()

# Check for any extra features in the validation data
extra_features = set(feature_names) - set(vectorizer.get_feature_names())

if extra_features:
    # Filter out the extra features from the validation vectors
    valid_vectors = valid_vectors[:, [i for i, feature_name in enumerate(feature_names) if feature_name not in extra_features]]

# Perform padding on the input vectors
max_sequence_length = train_vectors.shape[1]
valid_vectors = pad_sequences(valid_vectors, maxlen=max_sequence_length)

# List of model names and corresponding file names
model_names = ['Logistic Regression', 'Random Forest', 'Linear SVC', 'SGD Classifier']
file_names = ['LogisticRegression_updated.joblib', 'RandomForest_updated.joblib', 'LinearSVC_updated.joblib', 'SGDClassifier_updated.joblib']

# Load and evaluate each saved model
for model_name, file_name in zip(model_names, file_names):
    # Load the model
    model = load(file_name)

    # Make predictions on the train and validation datasets
    train_pred = model.predict(train_vectors)
    valid_pred = model.predict(valid_vectors)

    # Calculate evaluation metrics for the train dataset
    accuracy_train = accuracy_score(train_data['label'], train_pred)
    precision_train = precision_score(train_data['label'], train_pred, pos_label='QUEER VOICES')
    recall_train = recall_score(train_data['label'], train_pred, pos_label='QUEER VOICES')
    f1_train = f1_score(train_data['label'], train_pred, pos_label='QUEER VOICES')

    # Calculate evaluation metrics for the validation dataset
    accuracy_valid = accuracy_score(valid_data['label'], valid_pred)
    precision_valid = precision_score(valid_data['label'], valid_pred, pos_label='QUEER VOICES')
    recall_valid = recall_score(valid_data['label'], valid_pred, pos_label='QUEER VOICES')
    f1_valid = f1_score(valid_data['label'], valid_pred, pos_label='QUEER VOICES')

    # Print the evaluation metrics for both the training and validation datasets
    print(f"Metrics for {model_name}:")
    print(f"Train - Accuracy: {accuracy_train:.4f}, Precision: {precision_train:.4f}, Recall: {recall_train:.4f}, F1 Score: {f1_train:.4f}")
    print(f"Validation - Accuracy: {accuracy_valid:.4f}, Precision: {precision_valid:.4f}, Recall: {recall_valid:.4f}, F1 Score: {f1_valid:.4f}")
    print()


In [None]:
# Performance evaluation of the updated and saved Linear SVC and SGD Classifier models
# The performance of the two models are comparitively better than other binary classifiers

# Fit the vectorizer on the entire train data
vectorizer = TfidfVectorizer()
train_vectors = vectorizer.fit_transform(train_data['text'])

# Transform the train and validation text data using the vectorizer
valid_vectors = vectorizer.transform(valid_data['text'])

# Convert the sparse matrix to a dense matrix
valid_vectors = valid_vectors.toarray()

# Get the feature names from the vectorizer
feature_names = vectorizer.get_feature_names()

# Check for any extra features in the validation data
extra_features = set(feature_names) - set(vectorizer.get_feature_names())

if extra_features:
    # Filter out the extra features from the validation vectors
    valid_vectors = valid_vectors[:, [i for i, feature_name in enumerate(feature_names) if feature_name not in extra_features]]

# Perform padding on the input vectors
max_sequence_length = train_vectors.shape[1]
valid_vectors = pad_sequences(valid_vectors, maxlen=max_sequence_length)    
# train_vectors = pad_sequences(train_vectors, maxlen=max_sequence_length)    


model_names = ['Updated Linear SVC', 'Updated SGD Classifier']
file_names = ['LinearSVC_updated.joblib', 'SGDClassifier_updated.joblib']
    
# Load and evaluate each saved model
for model_name, file_name in zip(model_names, file_names):
    # Load the model
    if file_name.endswith('.joblib'):
        model = load(file_name)
    elif file_name.endswith('.h5'):
        model = load_model(file_name)
    else:
        raise ValueError(f"Unsupported file format: {file_name}")

    # Make predictions on the train and validation datasets
    train_pred = model.predict(train_vectors)
    valid_pred = model.predict(valid_vectors)

    # Calculate evaluation metrics for the train dataset
    accuracy_train = accuracy_score(train_data['label'], train_pred)
    precision_train = precision_score(train_data['label'], train_pred, pos_label='QUEER VOICES')
    recall_train = recall_score(train_data['label'], train_pred, pos_label='QUEER VOICES')
    f1_train = f1_score(train_data['label'], train_pred, pos_label='QUEER VOICES')

    # Calculate evaluation metrics for the validation dataset
    accuracy_valid = accuracy_score(valid_data['label'], valid_pred)
    precision_valid = precision_score(valid_data['label'], valid_pred, pos_label='QUEER VOICES')
    recall_valid = recall_score(valid_data['label'], valid_pred, pos_label='QUEER VOICES')
    f1_valid = f1_score(valid_data['label'], valid_pred, pos_label='QUEER VOICES')

    # Print the evaluation metrics for the train and validation datasets
    print(f"Train Metrics for {model_name}:")
    print(f"Accuracy: {accuracy_train:.4f}")
    print(f"Precision: {precision_train:.4f}")
    print(f"Recall: {recall_train:.4f}")
    print(f"F1 Score: {f1_train:.4f}")
    print()
    print(f"Validation Metrics for {model_name}:")
    print(f"Accuracy: {accuracy_valid:.4f}")
    print(f"Precision: {precision_valid:.4f}")
    print(f"Recall: {recall_valid:.4f}")
    print(f"F1 Score: {f1_valid:.4f}")
    print()

In [None]:
# Updateing the CNN model by making changes to its parameters and saving the updated model
# Load the train and validation datasets
train_data = pd.read_csv("train.csv")
valid_data = pd.read_csv("valid.csv")

# Drop rows with missing values
train_data = train_data.dropna()
valid_data = valid_data.dropna()

# Extract the text and label columns
train_text = train_data["text"].tolist()
train_labels = train_data["label"].tolist()
valid_text = valid_data["text"].tolist()
valid_labels = valid_data["label"].tolist()

# Encode labels
label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(train_labels)
valid_labels = label_encoder.transform(valid_labels)

# Tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_text)
train_sequences = tokenizer.texts_to_sequences(train_text)
valid_sequences = tokenizer.texts_to_sequences(valid_text)

# Pad sequences to have the same length
max_sequence_length = max(len(seq) for seq in train_sequences)
train_sequences = pad_sequences(train_sequences, maxlen=max_sequence_length)
valid_sequences = pad_sequences(valid_sequences, maxlen=max_sequence_length)

# Create the CNN model
model = Sequential()
model.add(Embedding(len(tokenizer.word_index) + 1, 100, input_length=max_sequence_length))
model.add(Conv1D(128, 5, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(len(label_encoder.classes_), activation='softmax'))

# Compile the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model with modified parameters
model.fit(train_sequences, train_labels, batch_size=64, epochs=20, validation_data=(valid_sequences, valid_labels))

# Evaluate the model
valid_pred = model.predict(valid_sequences)
valid_pred_labels = np.argmax(valid_pred, axis=1)

accuracy = accuracy_score(valid_labels, valid_pred_labels)
precision = precision_score(valid_labels, valid_pred_labels, average='weighted')
recall = recall_score(valid_labels, valid_pred_labels, average='weighted')
f1 = f1_score(valid_labels, valid_pred_labels, average='weighted')

# Print the evaluation metrics
print(f"Accuracy for CNN model: {accuracy:.4f}")
print(f"Precision score for CNN model: {precision:.4f}")
print(f"Recall score for CNN model: {recall:.4f}")
print(f"F1 Score for CNN model: {f1:.4f}")

# Save the updated CNN model
model.save('cnn_model_updated.h5')


Both Linear SVC Model, SGD Classifier and Random Forest have achieved the desired accuracy of 99% and F1 score of 0.99.


In [None]:
# Performance evaluation of the updated CNN model
# Load the saved model
loaded_model = load_model("cnn_model_updated.h5")

# Evaluate the loaded model on train data
train_pred = loaded_model.predict(train_sequences)
train_pred_labels = np.argmax(train_pred, axis=1)

train_accuracy = accuracy_score(train_labels, train_pred_labels)
train_precision = precision_score(train_labels, train_pred_labels, average='weighted')
train_recall = recall_score(train_labels, train_pred_labels, average='weighted')
train_f1 = f1_score(train_labels, train_pred_labels, average='weighted')

# Evaluate the loaded model on valid data
valid_pred = loaded_model.predict(valid_sequences)
valid_pred_labels = np.argmax(valid_pred, axis=1)

valid_accuracy = accuracy_score(valid_labels, valid_pred_labels)
valid_precision = precision_score(valid_labels, valid_pred_labels, average='weighted')
valid_recall = recall_score(valid_labels, valid_pred_labels, average='weighted')
valid_f1 = f1_score(valid_labels, valid_pred_labels, average='weighted')

# Print the evaluation metrics for train and valid data
print("Train Data Metrics for Updated CNN:")
print(f"Accuracy: {train_accuracy:.4f}")
print(f"Precision: {train_precision:.4f}")
print(f"Recall: {train_recall:.4f}")
print(f"F1 Score: {train_f1:.4f}")
print()
print("Valid Data Metrics for Updated CNN:")
print(f"Accuracy: {valid_accuracy:.4f}")
print(f"Precision: {valid_precision:.4f}")
print(f"Recall: {valid_recall:.4f}")
print(f"F1 Score: {valid_f1:.4f}")


In [None]:
# Error analysis of the updated Linear SVC and SGD Classifier model on train and valid dataset
# Fit the vectorizer on the entire train data
vectorizer = TfidfVectorizer()
train_vectors = vectorizer.fit_transform(train_data['text'])

# Transform the train and validation text data using the vectorizer
valid_vectors = vectorizer.transform(valid_data['text'])

# Convert the sparse matrix to a dense matrix
valid_vectors = valid_vectors.toarray()

# Get the feature names from the vectorizer
feature_names = vectorizer.get_feature_names()

# Check for any extra features in the validation data
extra_features = set(feature_names) - set(vectorizer.get_feature_names())

if extra_features:
    # Filter out the extra features from the validation vectors
    valid_vectors = valid_vectors[:, [i for i, feature_name in enumerate(feature_names) if feature_name not in extra_features]]

# Perform padding on the input vectors
max_sequence_length = train_vectors.shape[1]
valid_vectors = pad_sequences(valid_vectors, maxlen=max_sequence_length)

model_names = ['Linear SVC Updated','SGD Classifier Updated']
file_names = ['LinearSVC_updated.joblib','SGDClassifier_updated.joblib']

# Load and evaluate each saved model
for model_name, file_name in zip(model_names, file_names):
    # Load the model
    if file_name.endswith('.joblib'):
        model = load(file_name)
    elif file_name.endswith('.h5'):
        model = load_model(file_name)
    else:
        raise ValueError(f"Unsupported file format: {file_name}")

    # Make predictions on the train and validation datasets
    train_pred = model.predict(train_vectors)
    valid_pred = model.predict(valid_vectors)

    # Perform error analysis
    train_errors = train_data[train_data['label'] != train_pred]
    valid_errors = valid_data[valid_data['label'] != valid_pred]

    print(f"Error Analysis for {model_name}:")
    print("Train Errors:")
    print(train_errors)
    print()

    print("Validation Errors:")
    print(valid_errors)
    print()

    print("-------------------------")


In [None]:
# Error analysis of the updated CNN model on train and valid dataset
# Load the saved model
loaded_model = load_model("cnn_model_updated.h5")

# Get the predicted labels for train and valid datasets
train_pred_labels = np.argmax(loaded_model.predict(train_sequences), axis=1)
valid_pred_labels = np.argmax(loaded_model.predict(valid_sequences), axis=1)

# Get the original labels for train and valid datasets
train_original_labels = train_labels
valid_original_labels = valid_labels

# Create a DataFrame to store the results for train dataset
train_error_analysis_df = pd.DataFrame({"Text": train_text, "Original Label": train_original_labels, "Predicted Label": train_pred_labels})

# Create a DataFrame to store the results for valid dataset
valid_error_analysis_df = pd.DataFrame({"Text": valid_text, "Original Label": valid_original_labels, "Predicted Label": valid_pred_labels})

# Filter the DataFrames to get misclassified examples
train_misclassified_df = train_error_analysis_df[train_error_analysis_df["Original Label"] != train_error_analysis_df["Predicted Label"]]
valid_misclassified_df = valid_error_analysis_df[valid_error_analysis_df["Original Label"] != valid_error_analysis_df["Predicted Label"]]

# Print the misclassified examples for train dataset
print("Misclassified Examples in Train Dataset:")
print(train_misclassified_df)

# Print the misclassified examples for valid dataset
print("Misclassified Examples in Valid Dataset:")
print(valid_misclassified_df)


In [None]:
import joblib

# Cross-validation on the updated Linear SVC and SGD Model

# Load the train and validation datasets
train_data = pd.read_csv("train.csv")
valid_data = pd.read_csv("valid.csv")

# Merge train and validation datasets
merged_data = pd.concat([train_data, valid_data], ignore_index=True)

# Drop rows with missing values
merged_data = merged_data.dropna()

# Extract the text and label columns
text = merged_data["text"].tolist()
labels = merged_data["label"].tolist()

# Encode labels
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

# Vectorize the text data
vectorizer = TfidfVectorizer()
vectorized_text = vectorizer.fit_transform(text)

# Load the classifiers from saved .joblib files
classifiers = [
    ("Linear SVC", joblib.load("LinearSVC_updated.joblib")),
    ("SGD Classifier", joblib.load("SGDClassifier_updated.joblib"))
    ]

for classifier_name, classifier in classifiers:
    print(f"Results for {classifier_name}:")
    scores = cross_val_score(classifier, vectorized_text, encoded_labels, cv=5, scoring="accuracy")
    print("Cross-Validation Accuracy:", scores.mean())
    print("Cross-Validation Precision:", cross_val_score(classifier, vectorized_text, encoded_labels, cv=5, scoring="precision_weighted").mean())
    print("Cross-Validation Recall:", cross_val_score(classifier, vectorized_text, encoded_labels, cv=5, scoring="recall_weighted").mean())
    print("Cross-Validation F1 Score:", cross_val_score(classifier, vectorized_text, encoded_labels, cv=5, scoring="f1_weighted").mean())
    print()


In [44]:
# Cross-validation on the updated CNN model
# Load the train and validation datasets
train_data = pd.read_csv("train.csv")
valid_data = pd.read_csv("valid.csv")

# Merge train and validation datasets
merged_data = pd.concat([train_data, valid_data], ignore_index=True)

# Drop rows with missing values
merged_data = merged_data.dropna()

# Extract the text and label columns
text = merged_data["text"].tolist()
labels = merged_data["label"].tolist()

# Encode labels
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

# Tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text)
sequences = tokenizer.texts_to_sequences(text)

# Pad sequences to have the same length
max_sequence_length = max(len(seq) for seq in sequences)
sequences = pad_sequences(sequences, maxlen=max_sequence_length)

# Load the saved CNN model
loaded_model = load_model("cnn_model_updated.h5")

# Create a function to build the CNN model for KerasClassifier
def create_cnn_model():
    model = Sequential()
    model.add(Embedding(len(tokenizer.word_index) + 1, 100, input_length=max_sequence_length))
    model.add(Conv1D(128, 5, activation='relu'))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(len(label_encoder.classes_), activation='softmax'))
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Wrap the Keras model in KerasClassifier
keras_classifier = KerasClassifier(build_fn=create_cnn_model, epochs=10, batch_size=32, verbose=0)

# Perform cross-validation
print("Results for CNN model:")
scores = cross_val_score(keras_classifier, sequences, encoded_labels, cv=5, scoring="accuracy")
print("Cross-Validation Accuracy:", scores.mean())
print("Cross-Validation Precision:", cross_val_score(keras_classifier, sequences, encoded_labels, cv=5, scoring="precision_weighted").mean())
print("Cross-Validation Recall:", cross_val_score(keras_classifier, sequences, encoded_labels, cv=5, scoring="recall_weighted").mean())
print("Cross-Validation F1 Score:", cross_val_score(keras_classifier, sequences, encoded_labels, cv=5, scoring="f1_weighted").mean())


Cross-Validation Recall: 0.8052370720077796
Cross-Validation F1 Score: 0.80554100341575


The Linear SVC model achieved the highest cross-validation accuracy, precision, recall, and F1 score among all the classifiers, showing robust performance on the merged dataset. It performs slightly better than the CNN model in terms of accuracy, precision, and F1 score.

The CNN model performs reasonably well and exhibits competitive results compared to other classifiers. However, it falls slightly behind the Linear SVC in terms of accuracy and F1 score, but it is on par with the SGD Classifier.
The SGD Classifier also demonstrates good performance, but it falls slightly behind the CNN model and Linear SVC in terms of accuracy, precision, recall, and F1 score.

Overall, all three models (CNN, Linear SVC, and SGD Classifier) perform reasonably well on the merged dataset, and there is no significant difference in their performance. Depending on the specific requirements and use case, one might prefer one model over the others. Further fine-tuning and optimization may help to improve the model's performance further.

In [45]:
# Performance evaluation of the Best chosen model - Linear SVC on test dataset
# Load the test dataset
test_data = pd.read_csv("test.csv")
train_data = pd.read_csv("train.csv")

# Check for missing values in the 'text' column and drop rows with NaN
test_data = test_data.dropna(subset=['text'])
train_data = train_data.dropna(subset=['text'])


# Load the saved vectorizer from training
vectorizer = TfidfVectorizer()
train_vectors = vectorizer.fit_transform(train_data['text'])

# Transform the test text data using the vectorizer
test_vectors = vectorizer.transform(test_data['text'])

# Load the saved models
linear_svc_model = load('LinearSVC_updated.joblib')

# Make predictions on the test data using each model
linear_svc_pred_test = linear_svc_model.predict(test_vectors)

# Calculate accuracy and F1 score for each model on the test data
linear_svc_accuracy_test = accuracy_score(test_data['label'], linear_svc_pred_test)
linear_svc_f1_test = f1_score(test_data['label'], linear_svc_pred_test, pos_label='QUEER VOICES')

# Print the results
print("Linear SVC - Accuracy for test dataset:", linear_svc_accuracy_test, "F1 Score test dataset:", linear_svc_f1_test)


Linear SVC - Accuracy for test dataset: 0.8002594033722439 F1 Score test dataset: 0.8487229862475442


For the Updated Linear SVC model, the evaluation metrics on the validation set are as follows:

Accuracy: 0.6381
Precision: 0.6362
Recall: 0.9871
F1 Score: 0.7737
On the other hand, the evaluation metrics on the test set are as follows:

Accuracy: 0.8003
F1 Score: 0.8487

Comparing the results, we can see that the accuracy and F1 score on the test set are higher than those on the validation set. Generally, it is expected that the performance on the validation set should be a good estimate of the model's performance on unseen data, such as the test set. However, there can be some variability due to differences in the data distribution between the two sets.

There are several reasons why we might observe such differences between validation and test performance:

Data Split: The data split between the validation set and the test set might not be entirely representative of the overall data distribution. There could be variations in the samples or labels between the two sets.

Model Overfitting: It is possible that the model is overfitting to the training data, leading to relatively lower performance on the validation set compared to the test set.

Randomness: Machine learning models can have some inherent randomness, especially in the case of models like Random Forest and SGDClassifier. These random factors can lead to performance variations on different subsets of data.

Class Imbalance: The class distribution in the test set might be different from that in the validation set, especially for the positive class ('QUEER VOICES'), leading to differences in precision and recall.

To obtain a more accurate estimate of the model's performance, it is essential to evaluate the model on multiple datasets and consider cross-validation. This helps to mitigate the impact of data splitting and randomness.

In summary, while the evaluation metrics on the validation set are a good indicator of model performance, it is normal to observe some variability when applying the model to unseen data (test set). The test set performance should be considered the most reliable measure of the model's generalization capability. If the test set performance is satisfactory, it indicates that the model has learned meaningful patterns and can perform well on new, unseen data. If the test set performance is significantly lower than the validation set, it could indicate potential issues like overfitting or data distribution discrepancies. In such cases, further analysis and tuning may be required.

Validation Metrics for Updated Linear SVC:

Accuracy: 0.6381
Precision: 0.6362
Recall: 0.9871
F1 Score: 0.7737
Linear SVC for Test Data:

Accuracy for the test dataset: 0.8003
F1 Score for the test dataset: 0.8487

We can see that the performance of the Linear SVC model on the test dataset is noticeably better than on the validation dataset by comparing the validation metrics to the test metrics. Particularly, the test dataset's accuracy and F1 score are higher than those of the validation dataset. This finding implies that the model generalizes well to unknown data, which is indicative of a successful model.

The more training data there was (2476 samples in the training set compared to 2476 + 619 = 3095 samples in the combined training and validation set), the better the model did on the test dataset. 

The model can learn stronger representations of the underlying patterns in the data with more diverse input, which will improve generalization. Additional complex interactions between characteristics and labels can be captured by the model with more data, which may improve performance.

It's important to remember, though, that the increase in training data may not be the only factor contributing to the performance improvement. Model performance can also be strongly impacted by additional elements including hyperparameter tweaking, feature selection, and data quality.

In conclusion, the results imply that the Linear SVC model has demonstrated greater generalization to the test dataset when trained on additional data, which is a promising result. It emphasizes the significance of having a sizable and varied dataset to train reliable and precise machine learning models.