# Path

In [77]:
# Global path
GLOBAL_PATH = "C:/Users/rayji/OneDrive/Documents/Projet 2 ML"

# GloVe
GLOVE_PATH = f"{GLOBAL_PATH}/data/glove.twitter.27B.100d.txt"

# Train full
TRAIN_NEG_FULL_PATH = f"{GLOBAL_PATH}/data/train_neg_full.txt"
TRAIN_POS_FULL_PATH = f"{GLOBAL_PATH}/data/train_pos_full.txt"

# Train
TRAIN_NEG_PATH = f"{GLOBAL_PATH}/data/train_neg.txt"
TRAIN_POS_PATH = f"{GLOBAL_PATH}/data/train_pos.txt"

# Test
TEST_PATH = f"{GLOBAL_PATH}/data/test_data.txt"

# Preprocessed data
TRAIN_PREP_PATH = f"{GLOBAL_PATH}/data/preprocessed/train_gru.csv"
TEST_PREP_PATH = f"{GLOBAL_PATH}/data/preprocessed/test_gru.csv"

# Weight
WEIGHT_PATH = f"{GLOBAL_PATH}/weight"

# Abstract method

In [2]:
from sklearn.model_selection import train_test_split
from abc import ABC, abstractmethod
import pandas as pd

In [3]:
class AbstractModel(ABC):
    def __init__(self, weights_path: str):
        self.__weights_path = weights_path


    @abstractmethod
    def get_preprocessing_methods(self, is_test: bool = False):
        pass


    @abstractmethod
    def fit_predict(self, X, y, ids_test, X_test, prediction_path):
        pass


    @abstractmethod
    def predict(self, ids, X, path):
        pass


    @staticmethod
    def _create_submission(ids: list[int], predictions: list[int], path: str):
        # Generating the submission file
        submission = pd.DataFrame(columns=["Id", "Prediction"],
                                data={"Id": ids, "Prediction": predictions})

        # For many models the labels are 0 or 1. Replacing 0s with -1s.
        submission["Prediction"].replace(0, -1, inplace=True)

        # Saving the file
        submission.to_csv(path, index=False)


    @staticmethod
    def _split_data(X: pd.DataFrame, y: pd.DataFrame, test_size: float = 0.2, random_state: int = 42, **kwargs) -> tuple:
        print("Splitting data in train and test set...")
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state, **kwargs)

        return X_train, X_test, y_train, y_test

# Preprocessing


In [6]:
!pip install symspellpy

Collecting symspellpy
  Downloading symspellpy-6.7.7-py3-none-any.whl (2.6 MB)
     ---------------------------------------- 0.0/2.6 MB ? eta -:--:--
     ---------------------------------------- 0.0/2.6 MB ? eta -:--:--
      --------------------------------------- 0.1/2.6 MB 825.8 kB/s eta 0:00:04
     -- ------------------------------------- 0.1/2.6 MB 1.2 MB/s eta 0:00:03
     ----- ---------------------------------- 0.4/2.6 MB 2.3 MB/s eta 0:00:01
     ------------ --------------------------- 0.8/2.6 MB 4.1 MB/s eta 0:00:01
     ------------------- -------------------- 1.3/2.6 MB 5.1 MB/s eta 0:00:01
     ----------------------------- ---------- 1.9/2.6 MB 6.4 MB/s eta 0:00:01
     ---------------------------------------  2.6/2.6 MB 7.5 MB/s eta 0:00:01
     ---------------------------------------- 2.6/2.6 MB 7.2 MB/s eta 0:00:00
Collecting editdistpy>=0.1.3 (from symspellpy)
  Downloading editdistpy-0.1.3.tar.gz (57 kB)
     ---------------------------------------- 0.0/57.2 kB ? 

  error: subprocess-exited-with-error
  
  × Building wheel for editdistpy (pyproject.toml) did not run successfully.
  │ exit code: 1
  ╰─> [6 lines of output]
      running bdist_wheel
      running build
      running build_ext
      building 'editdistpy.levenshtein' extension
      error: Microsoft Visual C++ 14.0 or greater is required. Get it with "Microsoft C++ Build Tools": https://visualstudio.microsoft.com/visual-cpp-build-tools/
      [end of output]
  
  note: This error originates from a subprocess, and is likely not a problem with pip.
  ERROR: Failed building wheel for editdistpy
ERROR: Could not build wheels for editdistpy, which is required to install pyproject.toml-based projects


In [4]:
import pkg_resources
import nltk
import re
import pandas as pd
import numpy as np

from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from symspellpy import SymSpell # Fuzzy search and word correction

In [5]:
# nltk weights
nltk.download("averaged_perceptron_tagger")
nltk.download("stopwords")
nltk.download("wordnet")

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\rayji\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rayji\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rayji\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [94]:
import importlib
import preprocessing
from preprocessing import *
# Reload the library
importlib.reload(preprocessing)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\rayji\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rayji\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rayji\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\rayji\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


<module 'preprocessing' from 'c:\\Users\\rayji\\OneDrive\\Documents\\GitHub\\ml-project-2-big-three\\preprocessing.py'>

In [None]:
    methods.extend([
    'drop_duplicates',
    'remove_parentheses',
    'remove_parentheses',
    'remove_parentheses',
    'remove_tag',
    'remove_selected_characters',
    'correct_spacing_indexing',
    'remove_space_between_emoticons',
    'correct_spacing_indexing',
    'hashtags_to_tags',
    'correct_spacing_indexing',
    'word_segmentation',
    'correct_spacing_indexing',
    'slang_to_word',
    'correct_spacing_indexing',
    'correct_spelling',
    'lemmatize',
    'remove_stopwords',
    'numbers_to_tags',
    'replace_entities_with_tags',
    'correct_spacing_indexing'
    ])

In [110]:
def run_preprocessing(
    train_preprocessed_path=TRAIN_PREP_PATH, test_preprocessed_path=TEST_PREP_PATH, full_data=True
):
    """
    Runs the preprocessing methods according to the chosen classifier
      on the train and test data

    :param csr: chosen classifier (child of AbstractModel)
    :type csr: AbstractModel
    :param train_preprocessed_path: path to load train data
    :type train_preprocessed_path: str
    :param test_preprocessed_path: path to load test data
    :type test_preprocessed_path: str
    :param full_data: if False, the small dataset (200K rows) is used
    :type full_data: bool, optional
    """

    # Read data
    if full_data:
        dataset_files = [TRAIN_NEG_FULL_PATH, TRAIN_POS_FULL_PATH]
    else:
        dataset_files = [TRAIN_NEG_PATH, TRAIN_POS_PATH]

    train_preprocessing = Preprocessing(dataset_files, is_test=False)
    test_preprocessing = Preprocessing([TEST_PATH], is_test=True)

    # Preprocess it

    methods = []

    methods.extend([
        'drop_duplicates',
        'correct_spacing_indexing',
        'remove_parentheses',
        'correct_spacing_indexing',
        'word_segmentation',
        'correct_spacing_indexing',
        'correct_spacing_indexing',
        'remove_selected_characters',
        'hashtags_to_tags',
        'correct_spacing_indexing',
        'lemmatize',
        ])

    for method in methods:
        getattr(train_preprocessing, method)()

    methods.append('drop_duplicates')

    for method in methods:
        getattr(test_preprocessing, method)()

    train_df = train_preprocessing.__get__()
    #train_df.to_csv(train_preprocessed_path, index=False)

    test_df = test_preprocessing.__get__()
    #test_df.to_csv(test_preprocessed_path, index=False)

    return train_df, test_df

In [113]:
train_preprocessed_dataset, test_preprocessed_dataset = run_preprocessing(full_data=False)
train_preprocessed_dataset.to_csv('C:/Users/rayji/OneDrive/Documents/GitHub/ml-project-2-big-three/data/preprocessed/train_preprocessed.csv', index=False)
test_preprocessed_dataset.to_csv('C:/Users/rayji/OneDrive/Documents/GitHub/ml-project-2-big-three/data/preprocessed/test_preprocessed.csv', index=False)

Executing: `drop_duplicates`
Executing: `correct_spacing_indexing`
Correcting spacing...
Executing: `remove_parentheses`
Executing: `correct_spacing_indexing`
Correcting spacing...
Executing: `word_segmentation`


100%|██████████| 181321/181321 [00:35<00:00, 5140.75it/s]


Executing: `correct_spacing_indexing`
Correcting spacing...
Executing: `correct_spacing_indexing`
Correcting spacing...
Executing: `remove_selected_characters`
Removing selected characters...
Executing: `hashtags_to_tags`
Converting hashtags to tags...
Executing: `correct_spacing_indexing`
Correcting spacing...
Executing: `lemmatize`


100%|██████████| 181321/181321 [02:15<00:00, 1335.05it/s]


Executing: `drop_duplicates`
Executing: `correct_spacing_indexing`
Correcting spacing...
Executing: `remove_parentheses`
Executing: `correct_spacing_indexing`
Correcting spacing...
Executing: `word_segmentation`


100%|██████████| 10000/10000 [00:03<00:00, 2799.74it/s]


Executing: `correct_spacing_indexing`
Correcting spacing...
Executing: `correct_spacing_indexing`
Correcting spacing...
Executing: `remove_selected_characters`
Removing selected characters...
Executing: `hashtags_to_tags`
Converting hashtags to tags...
Executing: `correct_spacing_indexing`
Correcting spacing...
Executing: `lemmatize`


100%|██████████| 10000/10000 [00:06<00:00, 1513.46it/s]


Executing: `drop_duplicates`
Executing: `__get__`
Executing: `__get__`


In [112]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(train_preprocessed_dataset['text'], train_preprocessed_dataset['label'], test_size=0.2, random_state=42)

# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Transform the text data into TF-IDF features
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train a logistic regression model
model = LogisticRegression(max_iter=100000)
model.fit(X_train_tfidf, y_train)

# Predict labels for the test set
y_pred = model.predict(X_test_tfidf)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.7910657658899766


In [130]:
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.corpus import words
from collections import Counter


# Download the English language dictionary
nltk.download('words')
nltk.download('punkt')


texts = test_preprocessed_dataset['text']

# Function to clean and tokenize text
def clean_and_tokenize(text):
    # Remove non-alphabetic characters (keep only letters, parentheses, numbers, '.', '-', and '!')
    clean_text = re.sub(r'[^a-zA-Z0-9.!\-()]', ' ', text)
    # Tokenize
    return word_tokenize(clean_text.lower())

# Tokenize the text
tokens = [clean_and_tokenize(text) for text in texts]
flat_tokens = [item for sublist in tokens for item in sublist]


# Load a set of standard English words
english_words = set(words.words())

# Filter out standard English words
non_standard_tokens = [word for word in flat_tokens if word not in english_words]

# Count the frequencies
word_freq = Counter(non_standard_tokens)

# You can define a threshold for what you consider 'high frequency'
high_freq_threshold = 10  # Example threshold
high_freq_words = {word: count for word, count in word_freq.items() if count > high_freq_threshold}

ordered_words = {word: count for word, count in sorted(high_freq_words.items(), key=lambda item: item[1], reverse=True)}
print(ordered_words)

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\rayji\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rayji\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


{'!': 3847, '.': 3108, '(': 1954, 'firstname': 1314, '-': 778, ')': 361, 'fuck': 208, 'cd': 169, 'xxx': 140, 'hama': 137, 'fucking': 113, 'shit': 79, 'friday': 73, 'pc': 63, 'pm': 62, 'anymore': 47, 'eth': 46, 'hardcover': 45, 'int': 42, 'boyfriend': 40, 'proud': 39, 'okay': 38, 'hmm': 37, 'mb': 36, 'kid': 35, 'facebook': 35, 'saturday': 32, 'justin': 29, 'hang': 28, 'co': 27, 'april': 25, 'english': 25, 'fax': 24, 'der': 23, 'american': 23, 'monday': 23, 'sunday': 21, 'email': 21, 'chelsea': 21, 'girlfriend': 20, 'blog': 20, 'online': 20, 'inc': 19, 'joel': 19, 'oof': 17, 'thursday': 17, 'liam': 17, 'gb': 17, 'mah': 16, 'favourite': 16, 'youtube': 16, 'etc': 16, 'spanish': 15, 'def': 14, 'philippine': 14, 'sch': 13, 'fave': 13, 'girls': 13, 'tuesday': 12, 'internet': 12, 'samsung': 12, 'est': 11, 'ord': 11, 'christian': 11, 'nokia': 11}


In [124]:
import pandas as pd

# Convert the dictionary to a DataFrame
df_ordered_words = pd.DataFrame.from_dict(ordered_words, orient='index', columns=['Count'])

# Export the DataFrame to a CSV file
df_ordered_words.to_csv('data/preprocessed/ordered_words.csv', index_label='Word')


# Old

In [10]:
import pandas as pd
import importlib
import utils 
from utils import *
from os import listdir
from os.path import isfile, join
from random import shuffle
# Reload the library
importlib.reload(utils)

<module 'utils' from 'c:\\Users\\rayji\\OneDrive\\Documents\\GitHub\\ml-project-2-big-three\\utils.py'>

In [12]:
data_train = load_data()
test_data = load_test_data()

In [13]:
# Print the number of lines in each dataset
print(data_train.shape)
print(test_data.shape)

(200000, 2)
(10000, 1)


In [5]:
# Use read_pickle() to open the .pkl file
#vocab = pd.read_pickle('vocab.pkl')
#cooc_matrix = pd.read_pickle('cooc.pkl')

In [6]:
import scipy.sparse

# Assuming 'sparse_matrix' is your sparse matrix
# Replace row_index and col_index with the specific indices you want to access
row_index = 0
col_index = 0

# Access the element at the specified row and column indices
row = cooc_matrix.getrow(row_index)
element = row[0, col_index]

# 'element' now contains the value at the specified position in the sparse matrix
element

NameError: name 'cooc_matrix' is not defined

In [15]:
data_train

Unnamed: 0,text,label
0,<user> i dunno justin read my mention or not ....,1
1,"because your logic is so dumb , i won't even c...",1
2,""" <user> just put casper in a box ! "" looved t...",1
3,<user> <user> thanks sir > > don't trip lil ma...,1
4,visiting my brother tmr is the bestest birthda...,1
...,...,...
199995,can't wait to fake tan tonight ! hate being pale,0
199996,<user> darling i lost my internet connection ....,0
199997,kanguru defender basic 4 gb usb 2.0 flash driv...,0
199998,rizan is sad now,0


In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data_train['text'], data_train['label'], test_size=0.01, random_state=42)

# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Transform the text data into TF-IDF features
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train a logistic regression model
model = LogisticRegression(max_iter=5000)
model.fit(X_train_tfidf, y_train)

# Predict labels for the test set
y_pred = model.predict(X_test_tfidf)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.808


In [17]:
y_test_pred = model.predict(vectorizer.transform(test_data['text']))

In [39]:
export_submission(y_test_pred)