In [1]:
import re
import string
import pandas as pd
import nltk
from nltk.corpus import stopwords
from gensim.models import FastText
from sklearn.model_selection import train_test_split , learning_curve
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np

# Download NLTK stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Load dataset
df = pd.read_csv("data.csv")

# Apply text preprocessing
df["text_clean"] = df["text"].str.lower()
df["text_clean"] = df["text_clean"].str.replace(r'\d+', '', regex=True)
df["text_clean"] = df["text_clean"].str.translate(str.maketrans('', '', string.punctuation))
df["text_clean"] = df["text_clean"].apply(lambda text: ' '.join([word for word in text.split() if word not in stop_words]))

# Tokenize text for FastText
sentences = [text.split() for text in df["text_clean"]]

# Train FastText model
fasttext_model = FastText(sentences, vector_size=500, window=10, min_count=2, workers=20, sg=0)

# Function to get sentence vectors
def get_sentence_vector(words, model):
    vectors = [model.wv[word] for word in words if word in model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)

# Convert text data to FastText feature vectors
X_vectors = np.array([get_sentence_vector(text.split(), fasttext_model) for text in df["text_clean"]])

# Split data
X_train, X_test, y_train, y_test = train_test_split(X_vectors, df["label"], test_size=0.2, random_state=42)

# Train Logistic Regression Model
model = LogisticRegression(max_iter=500)
model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hano\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Model Accuracy: 0.9811


In [3]:
'''
vector_size: The dimensionality of the word vectors.
window: The size of the context window used to predict words.
min_count: The minimum frequency a word must have to be considered in the model.
workers: The number of CPU cores to use for training.
sg: The training algorithm (0 for CBOW, 1 for Skip-gram)

'''

'\nvector_size: The dimensionality of the word vectors.\nwindow: The size of the context window used to predict words.\nmin_count: The minimum frequency a word must have to be considered in the model.\nworkers: The number of CPU cores to use for training.\nsg: The training algorithm (0 for CBOW, 1 for Skip-gram)\n\n'

In [4]:
from sklearn.model_selection import train_test_split , learning_curve
import matplotlib.pyplot as plt


In [9]:
import json

accuracy = 0.9811 
with open("accuracy_FastText_LR.json", "w") as f:
    json.dump({"accuracy": accuracy}, f)