In [1]:
import os;
import os.path;

import numpy as np;
import pandas as pd;

import nltk;
import nltk.corpus;
import nltk.tokenize;
nltk.download('punkt');
nltk.download('stopwords');

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
def removePunctuation(text:list[str]):
    buffer = [];
    punc_list = ["!","#","$","%","&","(",")","*",\
                 "+","/",":",",",";",".","<","=",\
                 ">","@","[","\\","]","^","`","{",\
                 "|","}","~","\t","\n","-","\"","\'"];
    for word in text:
        if word not in punc_list:
            buffer.append(word);

    return buffer;

In [3]:
def removeStopWords(text:list[str]):
    buffer = [];
    stop_words = set(nltk.corpus.stopwords.words('english'));
    for word in stop_words:
        if word not in text:
            buffer.append(word);

    return buffer;

In [4]:
def prepDataset(path_x=None,path_y=None):
    if path_x is None or path_y is None:
        raise Exception("Path_x or Path_y is not provided...");
    else:
        X = [];
        with open(path_x) as file:
            lines = file.readlines();
            for line in lines:
                words = nltk.word_tokenize(line);
                words = removePunctuation(words);
                words = removeStopWords(words);
                X.append(" ".join(words).lower());

        with open(path_y) as file:
            y = file.readlines();

        return X,[int(_) for _ in y];

In [5]:
def prepTestset(path):
    if path is None:
        raise Exception("Path is expected..");
    else:
        X = [];
        with open(path) as file:
            lines = file.readlines();
            for line in lines:
                words = nltk.word_tokenize(line);
                words = removePunctuation(words);
                words = removeStopWords(words);
                X.append(" ".join(words).lower());

        return X;

In [7]:
train_x,train_y = prepDataset("/content/drive/MyDrive/Webis/DATA/trainset_cmb.txt",\
                              "/content/drive/MyDrive/Webis/y_train.txt");
val_x,val_y = prepDataset("/content/drive/MyDrive/Webis/DATA/valset_cmb.txt",\
                            "/content/drive/MyDrive/Webis/y_val.txt");

In [8]:
print(len(train_x),len(train_y));
print(len(val_x),len(val_y));

3200 3200
800 800


In [9]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer;
from sklearn.naive_bayes import MultinomialNB;
from sklearn.metrics import accuracy_score;

In [10]:
# unigram
vectorizer = CountVectorizer(ngram_range=(1,1));
X_train = vectorizer.fit_transform(train_x);
X_val = vectorizer.transform(val_x);

classifier = MultinomialNB();
classifier.fit(X_train,train_y);
y_pred = classifier.predict(X_val);

print(accuracy_score(val_y,y_pred));

0.42375


In [11]:
# bigram
vectorizer = CountVectorizer(ngram_range=(2,2));
X_train = vectorizer.fit_transform(train_x);
X_val = vectorizer.transform(val_x);

classifier = MultinomialNB();
classifier.fit(X_train,train_y);
y_pred = classifier.predict(X_val);

print(accuracy_score(val_y,y_pred));

0.42625


In [12]:
# unigram + bigram
vectorizer = CountVectorizer(ngram_range=(1,2));
X_train = vectorizer.fit_transform(train_x);
X_val = vectorizer.transform(val_x);

classifier = MultinomialNB();
classifier.fit(X_train,train_y);
y_pred = classifier.predict(X_val);

print(accuracy_score(val_y,y_pred));

0.425


In [13]:
# tf-idf
tf_vectorizer = TfidfVectorizer();
X_train = tf_vectorizer.fit_transform(train_x);
X_val = tf_vectorizer.transform(val_x);

tf_classifier = MultinomialNB();
tf_classifier.fit(X_train,train_y);
y_pred = tf_classifier.predict(X_val);

print(accuracy_score(val_y,y_pred));

0.45125
