<a href="https://colab.research.google.com/github/Cutie-tee/nlp_project/blob/main/nlp_textclassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Loading the Data

In [21]:
import pandas as pd
#Load the dataset
data =pd.read_csv('/content/training_data_lowercase.csv')
# column names
column_names = data.columns.tolist()
print("Column names:", column_names)

Column names: ['0\tdonald trump sends out embarrassing new year‚s eve message; this is disturbing']


In [22]:
# Reload the CSV with tab delimiter
data = pd.read_csv('/content/training_data_lowercase.csv', delimiter='\t')

# Rename the columns for clarity
data.columns = ['label', 'text']
# Display the first few rows to confirm
data.head()


Unnamed: 0,label,text
0,0,drunk bragging trump staffer started russian c...
1,0,sheriff david clarke becomes an internet joke ...
2,0,trump is so obsessed he even has obama‚s name ...
3,0,pope francis just called out donald trump duri...
4,0,racist alabama cops brutalize black boy while ...


Data Preprocessing ( Cleaning& Tokenisation)

In [None]:
#Clean the text to convert to lowercase, remove punctuation, whitespace , stop words, toeksnising andlemmatising

import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

# Download necessary NLTK data
nltk.download('stopwords')
nltk.download('punkt')

# Define stop words and preprocessing function
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenize and remove stopwords
    words = word_tokenize(text)
    words = [word for word in words if word not in stop_words]
    return ' '.join(words)

# Apply preprocessing to the 'text' column
data['cleaned_text'] = data['text'].apply(preprocess_text)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Converting Test to Numerical Features

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(data['cleaned_text'])
y = data['label']  # Assuming 'label' column is already numeric; if not, convert it with LabelEncoder


 Vectorize Text Data with TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_features=5000)  # Limit features for efficiency
X = vectorizer.fit_transform(data['cleaned_text'])
y = data['label']  # Assuming 'label' is the target variable


Train-Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


Model Training with Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Initialize and train classifier
model = LogisticRegression()
model.fit(X_train, y_train)


Evaluate the model

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Make predictions on the test set
y_pred = model.predict(X_test)

# Print evaluation metrics
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.9371980676328503
              precision    recall  f1-score   support

           0       0.94      0.94      0.94      3517
           1       0.94      0.94      0.94      3314

    accuracy                           0.94      6831
   macro avg       0.94      0.94      0.94      6831
weighted avg       0.94      0.94      0.94      6831



Cross validation with SVM  


In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
import numpy as np

# Extract text and labels
texts = data['text']
labels = data['label']

# Convert text data to TF-IDF features
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(texts)
y = labels



In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
import numpy as np

# Initialize the SVM model
svm_model = SVC(kernel='linear')

# Perform 5-fold cross-validation
cv_scores = cross_val_score(svm_model, X, y, cv=5)

# Print the accuracy for each fold and the mean accuracy
print("Cross-Validation Scores for each fold:", cv_scores)
print("Average Cross-Validation Score:", np.mean(cv_scores))


Cross-Validation Scores for each fold: [0.90469917 0.92093704 0.9033675  0.94304539 0.93704246]
Average Cross-Validation Score: 0.9218183104197492


Hyperparameter tuning

Save the model and vectorizer  

In [None]:
import pickle

# Save the model
with open('text_classifier.pkl', 'wb') as f:
    pickle.dump(model, f)

# Save the vectorizer
with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)
