<a href="https://colab.research.google.com/github/EdnaEze/Credit-Scoring/blob/main/new_word2vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score, classification_report
from gensim.models import Word2Vec
from sklearn.linear_model import LogisticRegression

In [2]:
# load data
train_data = pd.read_csv("/content/drive/MyDrive/ee278/ANLP assignment/propaganda_dataset_v2/propaganda_train.tsv", delimiter="\t")
test_data = pd.read_csv("/content/drive/MyDrive/ee278/ANLP assignment/propaganda_dataset_v2/propaganda_val.tsv", delimiter="\t")

In [3]:
#Preprocess the data
train_sentences = [sentence.split() for sentence in train_data["tagged_in_context"]]
test_sentences = [sentence.split() for sentence in test_data["tagged_in_context"]]
train_labels = train_data["label"]
test_labels = test_data["label"]

train_labels_task2 = [0 if label=="not_propaganda" else 1 for label in train_labels]
test_labels_task2 = [0 if label=="not_propaganda" else 1 for label in test_labels]

In [19]:
# Define labels for task 1
label_map = {"not_propaganda": 0,
             "flag_waving": 1, 
             "appeal_to_fear_prejudice": 2, 
             "causal_oversimplification": 3, 
             "doubt": 4, 
             "exaggeration,minimisation": 5, 
             "loaded_language": 6, 
             "name_calling,labeling": 7, 
             "repetition": 8
             }

train_labels_task1 = [label_map[label] for label in train_labels]
test_labels_task1 = [label_map[label] for label in test_labels]

In [20]:
#Train Word2Vec model
w2v_model = Word2Vec(train_sentences, vector_size=300, window=5, min_count=1, workers=4)

In [21]:
#Define function to convert sentences to vectors using Word2Vec model
def sentence_to_vec(sentence, model):
    vec = np.zeros(300)
    count = 0
    for word in sentence:
        try:
            vec += model.wv[word]
            count += 1
        except KeyError:
            pass
    if count != 0:
        vec /= count
    return vec

In [22]:
#Convert sentences to vectors using Word2Vec model
train_embeddings = [sentence_to_vec(sentence, w2v_model) for sentence in train_sentences]
test_embeddings = [sentence_to_vec(sentence, w2v_model) for sentence in test_sentences]

In [34]:
# Train a logistic regression classifier for task 1
lr1 = LogisticRegression(random_state=42, multi_class='multinomial', solver='newton-cg')
lr1.fit(train_embeddings, train_labels)

predictions1 = lr1.predict(test_embeddings)
accuracy1 = accuracy_score(test_labels, predictions1)
report1 = classification_report(test_labels, predictions1, target_names=label_map.keys(), zero_division=1)

In [35]:
# Print the results
print("Task 1 accuracy: ", accuracy1)
print("Task 1 classification report:\n", report1)

Task 1 accuracy:  0.5189655172413793
Task 1 classification report:
                            precision    recall  f1-score   support

           not_propaganda       1.00      0.00      0.00        43
              flag_waving       1.00      0.00      0.00        31
 appeal_to_fear_prejudice       1.00      0.00      0.00        38
causal_oversimplification       1.00      0.00      0.00        28
                    doubt       1.00      0.00      0.00        39
exaggeration,minimisation       1.00      0.00      0.00        37
          loaded_language       1.00      0.00      0.00        31
    name_calling,labeling       0.52      1.00      0.68       301
               repetition       1.00      0.00      0.00        32

                 accuracy                           0.52       580
                macro avg       0.95      0.11      0.08       580
             weighted avg       0.75      0.52      0.35       580



In [29]:
# Train a logistic regression classifier for task 2
lr2 = LogisticRegression(random_state=42, multi_class="ovr", solver="newton-cg")
lr2.fit(train_embeddings, train_labels_task2)

predictions2 = lr2.predict(test_embeddings)
accuracy2 = accuracy_score(test_labels_task2, predictions2)
report2 = classification_report(test_labels_task2, predictions2, target_names=["not_propaganda", "propaganda"])

In [32]:
print("Task 2 accuracy: ", accuracy2)
print("Task 2 classification report:\n", report2)

Task 2 accuracy:  0.5586206896551724
Task 2 classification report:
                 precision    recall  f1-score   support

not_propaganda       0.55      0.77      0.64       301
    propaganda       0.57      0.33      0.42       279

      accuracy                           0.56       580
     macro avg       0.56      0.55      0.53       580
  weighted avg       0.56      0.56      0.54       580

