In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
import random
from collections import Counter


In [3]:
# read data from file
data = pd.read_csv('a3_train_final.tsv',sep='\t', header=None, names=['label', 'text'])

In [6]:
# This function separates the annotations in the label column and marks indecisive cases like 1/0 with -1. -1 will be dropped later
# Function to process labels
def process_labels(label):
    label_list = label.split("/")
    label_list = [l for l in label_list if l != "-1"]  # Drop -1 labels
    if len(label_list) == 1:
        return label_list[0]  # Return the single label
    else:
        counter = Counter(label_list)
        max_vote = max(counter.values())
        majority_labels = [l for l, count in counter.items() if count == max_vote]
        if len(majority_labels) == 1:
            return random.choice(majority_labels)  # Randomly choose from majority labels
        else:
            return -1

In [7]:
# Apply label processing
data['final_label'] = data['label'].apply(process_labels)

In [17]:
# checking if it worked
data.head(15)

Unnamed: 0,label,text,final_label
0,1/1,I'll only consume if I know what's inside it....,1
1,0/-1,It is easier to fool a million people than it...,0
2,0/0,NATURAL IMMUNITY protected us since evolutio...,0
3,0/-1,NATURAL IMMUNITY protected us since evolutio...,0
4,0/0,"Proud to have resisted. Proud of my husband, ...",0
5,1/1/1/-1,The bigest sideffect of vaccines is fewer dea...,1
6,1/-1,Unvaccinated people are more likely to become...,1
7,1/1,Vaccine takes more than a year to develop. T...,1
8,0/0,YES IM A TRUCKER FROM USA AND I WONT GET VACC...,0
9,0/0,"covid vaccines are safe , Goes to show , if ...",0


In [18]:
len(data)

50068

In [15]:
# filter out indecisive text and drop label column; not relevant for training and evaluation
df = data[data['final_label'] != -1]
df = df.drop(['label'], axis=1)

In [27]:
# Drop duplicates based on the "text" column
final_df= df.drop_duplicates(subset=['text'])

# Create a DataFrame containing only the dropped rows
dropped_rows_df = df[~df.index.isin(final_df.index)]

# Print the DataFrame with dropped duplicates
print("DataFrame with dropped duplicates:")
print(final_df)

# Print the DataFrame containing dropped rows
print("\nDataFrame containing dropped rows:")
print(dropped_rows_df)

DataFrame with dropped duplicates:
                                                    text final_label
0       I'll only consume if I know what's inside it....           1
1       It is easier to fool a million people than it...           0
2       NATURAL IMMUNITY  protected us since evolutio...           0
3       NATURAL IMMUNITY  protected us since evolutio...           0
4       Proud to have resisted. Proud of my husband, ...           0
...                                                  ...         ...
50063  🤣 keep your 💩 I already know 3 people who have...           0
50064  🤣🤣🤣 "JUST BECAUSE IT'S SAFE, DOESN'T MEAN IT D...           0
50065  🤣🤣🤣 I took the Vaccine because of work. If I d...           0
50066  🤨there's people already having severe side eff...           0
50067  🥦I ❤my covid vaccines and I'm so excited for m...           1

[47370 rows x 2 columns]

DataFrame containing dropped rows:
                                                    text final_label
7655  

In [28]:
# Divide into label and data
X = df['text']
y = df['final_label']
# Divide data into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [29]:
# Pipeline for Logistic Regression with TfidfVectorizer
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression())
])

In [34]:
# Hyperparameters
# Define the parameter grid
param_grid = {
    'tfidf__max_features': [1000, 2000, 3000],
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'clf__C': [0.1, 1.0, 10.0],
    'clf__penalty': ['l1', 'l2'],  # Regularization penalty
    'clf__solver': ['liblinear', 'saga'],  # Solver algorithm for logistic regression
    'clf__max_iter': [1000],  # Maximum number of iterations
}

# Perform GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Print best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)


Best hyperparameters: {'clf__C': 10.0, 'tfidf__max_features': 3000, 'tfidf__ngram_range': (1, 2)}


In [35]:
# Evaluate on test set
accuracy = grid_search.best_estimator_.score(X_test, y_test)
print("Accuracy on test set:", accuracy)

Accuracy on test set: 0.8162532981530343


In [None]:
from sklearn.metrics import classification_report

# Generate predictions
y_pred = grid_search.best_estimator_.predict(X_test)

# Generate classification report
report = classification_report(y_test, y_pred)

# Print the report
print("Classification Report:\n", report)