In [None]:
import re
import sys
import nltk
import spacy
import numpy as np
import pandas as pd

import multiprocessing
from functools import partial
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [None]:
#!{sys.executable} -m pip install contractions
import contractions

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
rawdf = pd.read_csv('file2.csv', engine="python", on_bad_lines="skip")
rawdf.dropna(inplace=True)

In [None]:
print(rawdf.shape[0])

4489


In [None]:
rawdf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4489 entries, 0 to 4834
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    4489 non-null   object
 1   label   4489 non-null   object
dtypes: object(2)
memory usage: 105.2+ KB


In [None]:
labels = ['AVI', 'MIS', 'OTH', 'WIN', 'WEA',
           'COC', 'PRP', 'MEC', 'ELC', 'LAG',
           'FLC', 'FFC', 'ECS', 'FLU']

def filter_by_label(df):
  filtered_df = rawdf[rawdf['label'].apply(lambda x: isinstance(x, str) and x in labels)]
  return filtered_df

df = filter_by_label(rawdf)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4374 entries, 0 to 4834
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    4374 non-null   object
 1   label   4374 non-null   object
dtypes: object(2)
memory usage: 102.5+ KB


In [None]:
class_counts = df.groupby('label').size().reset_index(name='count')
print(class_counts)

   label  count
0    AVI   1633
1    COC    370
2    ECS     12
3    ELC     18
4    FFC     24
5    FLC    109
6    FLU      3
7    LAG     36
8    MEC     29
9    MIS    872
10   OTH    821
11   PRP     99
12   WEA    330
13   WIN     18


In [None]:
def expand_contractions(text):
    contractions = {
        "n't": "not",
        "'ve": "have",
        "'ll": "will",
        "'d": "would",
        "'re": "are",
        "'s": "is",
        "'m": "am"
    }
    pattern = re.compile(r"\b(?:" + "|".join(contractions.keys()) + r")\b")
    return pattern.sub(lambda match: contractions[match.group(0)], text)

# Function to remove special characters
def remove_special_characters(text):
    special_characters_pattern = re.compile(r"[#%@\<>{}()=\[\]*\-\\|_½]")
    return special_characters_pattern.sub("", text)


def preprocess_text(text):
    text = expand_contractions(text)
    text = remove_special_characters(text)

    tokens = nltk.word_tokenize(text)

    lemmatizer = nltk.stem.WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    lemmatized_tokens = [token.lower() for token in lemmatized_tokens]

    stop_words = set(nltk.corpus.stopwords.words('english'))
    filtered_tokens = [token for token in lemmatized_tokens if token not in stop_words]

    return filtered_tokens

df['text'] = df['text'].apply(preprocess_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text'] = df['text'].apply(preprocess_text)


In [None]:
import time
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.1, random_state=42)

X_train = [' '.join(tokens) for tokens in X_train]
X_test = [' '.join(tokens) for tokens in X_test]

tfidf_vectorizer = TfidfVectorizer(max_features=20000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [None]:
from sklearn.naive_bayes import MultinomialNB

nb_model = MultinomialNB()

start = time.time()
nb_model.fit(X_train_tfidf, y_train)
y_pred_nb = nb_model.predict(X_test_tfidf)
end = time.time()

report = classification_report(y_test, y_pred_nb)

print(f"MultinomialNB Execution Time: {end-start}")
print("MultinomialNB Accuracy:", accuracy_score(y_test, y_pred_nb))
print(report)

MultinomialNB Execution Time: 0.04742002487182617
MultinomialNB Accuracy: 0.3972602739726027
              precision    recall  f1-score   support

         AVI       0.37      0.98      0.54       154
         COC       0.00      0.00      0.00        36
         ECS       0.00      0.00      0.00         2
         FFC       0.00      0.00      0.00         3
         FLC       0.00      0.00      0.00        11
         LAG       0.00      0.00      0.00         4
         MEC       0.00      0.00      0.00         2
         MIS       0.83      0.06      0.12        78
         OTH       0.86      0.19      0.31        94
         PRP       0.00      0.00      0.00        11
         WEA       0.00      0.00      0.00        41
         WIN       0.00      0.00      0.00         2

    accuracy                           0.40       438
   macro avg       0.17      0.10      0.08       438
weighted avg       0.46      0.40      0.28       438



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
from sklearn.linear_model import LogisticRegression

lr_model = LogisticRegression(max_iter=20000)

start = time.time()
lr_model.fit(X_train_tfidf, y_train)
y_pred_lr = lr_model.predict(X_test_tfidf)
end = time.time()

report = classification_report(y_test, y_pred_lr)

print(f"Logistic Regression Execution Time: {end-start}")
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))
print(report)

Logistic Regression Execution Time: 18.54921817779541
Logistic Regression Accuracy: 0.5114155251141552
              precision    recall  f1-score   support

         AVI       0.47      0.77      0.58       154
         COC       0.56      0.14      0.22        36
         ECS       0.00      0.00      0.00         2
         FFC       0.00      0.00      0.00         3
         FLC       1.00      0.09      0.17        11
         LAG       0.00      0.00      0.00         4
         MEC       0.00      0.00      0.00         2
         MIS       0.49      0.47      0.48        78
         OTH       0.58      0.48      0.53        94
         PRP       1.00      0.18      0.31        11
         WEA       0.75      0.37      0.49        41
         WIN       0.00      0.00      0.00         2

    accuracy                           0.51       438
   macro avg       0.40      0.21      0.23       438
weighted avg       0.54      0.51      0.48       438



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
from sklearn.svm import LinearSVC
svm_model = LinearSVC()

start = time.time()
svm_model.fit(X_train_tfidf, y_train)
y_pred_svm = svm_model.predict(X_test_tfidf)
end = time.time()

report = classification_report(y_test, y_pred_svm)

print(f"Linear SVM Execution Time: {end-start}")
print("Linear SVM Accuracy:", accuracy_score(y_test, y_pred_svm))
print(report)

Linear SVM Execution Time: 1.5595958232879639
Linear SVM Accuracy: 0.5045662100456622
              precision    recall  f1-score   support

         AVI       0.49      0.64      0.56       154
         COC       0.57      0.22      0.32        36
         ECS       0.50      0.50      0.50         2
         ELC       0.00      0.00      0.00         0
         FFC       1.00      0.33      0.50         3
         FLC       0.43      0.27      0.33        11
         LAG       1.00      0.25      0.40         4
         MEC       0.00      0.00      0.00         2
         MIS       0.44      0.47      0.46        78
         OTH       0.58      0.54      0.56        94
         PRP       1.00      0.36      0.53        11
         WEA       0.46      0.39      0.42        41
         WIN       0.00      0.00      0.00         2

    accuracy                           0.50       438
   macro avg       0.50      0.31      0.35       438
weighted avg       0.52      0.50      0.49     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

start = time.time()
rf_model.fit(X_train_tfidf, y_train)
y_pred_rf = rf_model.predict(X_test_tfidf)
end = time.time()

report = classification_report(y_test, y_pred_rf)

print(f"Random Forest Execution Time: {end-start}")
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print(report)

Random Forest Execution Time: 20.6485276222229
Random Forest Accuracy: 0.4611872146118721
              precision    recall  f1-score   support

         AVI       0.42      0.84      0.56       154
         COC       0.40      0.06      0.10        36
         ECS       0.00      0.00      0.00         2
         ELC       0.00      0.00      0.00         0
         FFC       0.00      0.00      0.00         3
         FLC       0.50      0.18      0.27        11
         LAG       0.00      0.00      0.00         4
         MEC       0.00      0.00      0.00         2
         MIS       0.48      0.28      0.35        78
         OTH       0.64      0.41      0.50        94
         PRP       1.00      0.09      0.17        11
         WEA       0.70      0.17      0.27        41
         WIN       0.00      0.00      0.00         2

    accuracy                           0.46       438
   macro avg       0.32      0.16      0.17       438
weighted avg       0.50      0.46      0.41 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
