In [7]:
import pandas as pd
import numpy as np
import re
import pickle
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.svm import SVC  # Using SVM instead of Naive Bayes
from sklearn.metrics import classification_report, accuracy_score
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import nltk

# Make sure the NLTK stop-word corpus is available before it is used below
nltk.download('stopwords')

# Locations of the training and validation CSV files
training_file = '/content/drive/MyDrive/Data_Mining/twitter_training.csv'
validation_file = '/content/drive/MyDrive/Data_Mining/twitter_validation.csv'

# Column names assigned on load, and the encoding used to read the files
DATASET_COLUMNS = ['id', 'category', 'sentiment', 'text']
DATASET_ENCODING = "ISO-8859-1"

# Numeric code assigned to each sentiment label
sentiment_mapping = {"Negative": 0, "Neutral": 2, "Positive": 4, "Irrelevant": 1}


def _load_labelled_tweets(csv_path):
    """Read one CSV split, encode its sentiment labels and keep text + sentiment.

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        DataFrame with the columns 'text' and 'sentiment', where 'sentiment'
        has been passed through ``sentiment_mapping``.
    """
    frame = pd.read_csv(csv_path, encoding=DATASET_ENCODING, names=DATASET_COLUMNS)
    frame['sentiment'] = frame['sentiment'].map(sentiment_mapping)
    return frame[['text', 'sentiment']]


train_df = _load_labelled_tweets(training_file)
valid_df = _load_labelled_tweets(validation_file)

# Show which encoded label values are present in each split
for heading, frame in (
    ("Training Data Target Values:", train_df),
    ("Validation Data Target Values:", valid_df),
):
    print(heading, frame['sentiment'].unique())

# Missing tweets become empty strings so every entry is a str
train_df = train_df.assign(text=train_df['text'].fillna("").astype(str))
valid_df = valid_df.assign(text=valid_df['text'].fillna("").astype(str))

# Shared NLTK helpers used by preprocess_text
stemmer = PorterStemmer()
tokenizer = RegexpTokenizer(r'\w+')
stop_words = set(stopwords.words('english'))

# Text preprocessing function
def preprocess_text(text):
    """Turn raw text into a space-separated string of stemmed tokens.

    Runs of digits are removed, the remainder is lowercased and split with
    the module-level ``tokenizer``, tokens present in ``stop_words`` are
    dropped, and every surviving token is passed through ``stemmer``.

    Args:
        text (str): Raw input text.

    Returns:
        str: The stemmed tokens joined by single spaces (empty string when
        no token survives).
    """
    digit_free = re.sub(r'\d+', '', text)
    stems = []
    for token in tokenizer.tokenize(digit_free.lower()):
        if token in stop_words:
            continue  # stop word: contributes nothing to the output
        stems.append(stemmer.stem(token))
    return ' '.join(stems)

# Replace the raw tweets in both splits with their preprocessed form
for frame in (train_df, valid_df):
    frame['text'] = frame['text'].apply(preprocess_text)

# TF-IDF over unigrams and bigrams, capped at 5000 features.
# The vocabulary is fitted on the training split only.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
X_train_tfidf = vectorizer.fit_transform(train_df['text'])
X_valid_tfidf = vectorizer.transform(valid_df['text'])

# Keep the 3000 features with the highest chi-square score against the labels
y_train = train_df['sentiment']
chi2_selector = SelectKBest(score_func=chi2, k=3000)
X_train_chi2 = chi2_selector.fit_transform(X_train_tfidf, y_train)
X_valid_chi2 = chi2_selector.transform(X_valid_tfidf)

# Fit the linear-kernel SVM (other kernels such as 'rbf' or 'poly' can be tried)
svm_model = SVC(kernel='linear').fit(X_train_chi2, y_train)

# Directory that receives the pickled artifacts; created when missing
save_directory = '/content/drive/MyDrive/Data_Mining/myModel'
os.makedirs(save_directory, exist_ok=True)

# Persist the model together with the fitted vectorizer and selector
artifacts = {
    'svm_model.pkl': svm_model,
    'vectorizer.pkl': vectorizer,
    'chi2_selector.pkl': chi2_selector,
}
for filename, artifact in artifacts.items():
    with open(os.path.join(save_directory, filename), 'wb') as out_file:
        pickle.dump(artifact, out_file)

print(f"Model and associated files saved in: {save_directory}")

# Score the fitted model on the validation split
y_valid = valid_df['sentiment']
y_pred = svm_model.predict(X_valid_chi2)
print("Classification Report:")
print(classification_report(y_valid, y_pred))
print("Accuracy:", accuracy_score(y_valid, y_pred))

# Unpickled artifacts, keyed by their file paths:
# {paths: (modification times, (model, vectorizer, selector))}
_ARTIFACT_CACHE = {}

# Maps the numeric class codes produced by the model back to their labels
_SENTIMENT_LABELS = {0: "Negative", 1: "Irrelevant", 2: "Neutral", 4: "Positive"}


def _load_artifacts(directory):
    """Return (svm_model, vectorizer, chi2_selector) unpickled from ``directory``.

    The three objects are cached after the first load so repeated predictions
    do not unpickle them again. The cache entry is refreshed whenever the
    modification time of any of the files changes, so a re-saved model is
    picked up on the next call.

    Raises:
        FileNotFoundError: If one of the artifact files does not exist.
    """
    filenames = ('svm_model.pkl', 'vectorizer.pkl', 'chi2_selector.pkl')
    paths = tuple(os.path.join(directory, name) for name in filenames)
    stamps = tuple(os.path.getmtime(path) for path in paths)

    entry = _ARTIFACT_CACHE.get(paths)
    if entry is None or entry[0] != stamps:
        loaded = []
        for path in paths:
            # NOTE(security): pickle.load can execute arbitrary code. Only
            # point save_directory at artifacts written by a trusted source.
            with open(path, 'rb') as artifact_file:
                loaded.append(pickle.load(artifact_file))
        entry = (stamps, tuple(loaded))
        _ARTIFACT_CACHE[paths] = entry
    return entry[1]


# Function to predict sentiment for new text
def predict_sentiment(text):
    """Predict the sentiment label of a single piece of text.

    The text goes through ``preprocess_text``, the saved TF-IDF vectorizer and
    the saved chi-square selector before being classified by the saved SVM.
    Artifacts are read from ``save_directory`` (looked up when the function is
    called) and cached between calls.

    Args:
        text (str): Raw text to classify.

    Returns:
        str: One of "Negative", "Irrelevant", "Neutral" or "Positive".

    Raises:
        FileNotFoundError: If a saved artifact is missing from ``save_directory``.
        KeyError: If the model predicts a class code without a known label.
    """
    svm_model, vectorizer, chi2_selector = _load_artifacts(save_directory)

    # Apply the same preprocessing and feature pipeline as at training time
    processed_text = preprocess_text(text)
    features = vectorizer.transform([processed_text])
    features = chi2_selector.transform(features)

    # A single sample goes in, so exactly one prediction comes back
    prediction = svm_model.predict(features)
    return _SENTIMENT_LABELS[prediction[0]]

# Quick end-to-end check of predict_sentiment on one hand-written example
# (alternative negative example: "I hate this product")
sample_tweet = "I love this product! Highly recommended."
predicted_label = predict_sentiment(sample_tweet)
print("Sample Tweet Prediction:", predicted_label)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Training Data Target Values: [4 2 0 1]
Validation Data Target Values: [1 2 0 4]
Model and associated files saved in: /content/drive/MyDrive/Data_Mining/myModel
Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.88      0.81       266
           1       0.80      0.72      0.75       172
           2       0.83      0.71      0.76       285
           4       0.77      0.81      0.79       277

    accuracy                           0.78      1000
   macro avg       0.79      0.78      0.78      1000
weighted avg       0.79      0.78      0.78      1000

Accuracy: 0.783
Sample Tweet Prediction: Positive


In [6]:
import pickle
import os
import re
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import nltk

# Directory holding the pickled model, vectorizer and chi-square selector
save_directory = '/content/drive/MyDrive/Data_Mining/myModel'

# Make sure the NLTK stop-word corpus is available for preprocess_text
nltk.download('stopwords')

# Holds the NLTK helpers once they have been built: {'tools': (stop_words, tokenizer, stemmer)}
_NLP_TOOL_CACHE = {}


# Text preprocessing function (must match the one used during training)
def preprocess_text(text):
    """Turn raw text into a space-separated string of stemmed tokens.

    Runs of digits are removed, the remainder is lowercased and split on
    ``\\w+`` matches, English stop words are dropped, and every surviving
    token is stemmed with a Porter stemmer.

    The stop-word set, tokenizer and stemmer are created on the first call
    and reused afterwards instead of being rebuilt for every input.

    Args:
        text (str): Raw input text.

    Returns:
        str: The stemmed tokens joined by single spaces (empty string when
        no token survives).
    """
    tools = _NLP_TOOL_CACHE.get('tools')
    if tools is None:
        # Build all three before storing, so a failure leaves the cache empty
        tools = (
            set(stopwords.words('english')),
            RegexpTokenizer(r'\w+'),
            PorterStemmer(),
        )
        _NLP_TOOL_CACHE['tools'] = tools
    stop_words, tokenizer, stemmer = tools

    text = re.sub(r'\d+', '', text)  # Remove numbers
    tokens = tokenizer.tokenize(text.lower())  # Tokenize and lowercase
    # Drop stop words, then stem what is left
    return ' '.join(stemmer.stem(word) for word in tokens if word not in stop_words)

# Unpickled artifacts, keyed by their file paths:
# {paths: (modification times, (model, vectorizer, selector))}
_ARTIFACT_CACHE = {}

# Maps the numeric class codes produced by the model back to their labels
_SENTIMENT_LABELS = {0: "Negative", 1: "Irrelevant", 2: "Neutral", 4: "Positive"}


def _load_artifacts(directory):
    """Return (svm_model, vectorizer, chi2_selector) unpickled from ``directory``.

    The three objects are cached after the first load so repeated predictions
    do not unpickle them again. The cache entry is refreshed whenever the
    modification time of any of the files changes, so a re-saved model is
    picked up on the next call.

    Raises:
        FileNotFoundError: If one of the artifact files does not exist.
    """
    filenames = ('svm_model.pkl', 'vectorizer.pkl', 'chi2_selector.pkl')
    paths = tuple(os.path.join(directory, name) for name in filenames)
    stamps = tuple(os.path.getmtime(path) for path in paths)

    entry = _ARTIFACT_CACHE.get(paths)
    if entry is None or entry[0] != stamps:
        loaded = []
        for path in paths:
            # NOTE(security): pickle.load can execute arbitrary code. Only
            # point save_directory at artifacts written by a trusted source.
            with open(path, 'rb') as artifact_file:
                loaded.append(pickle.load(artifact_file))
        entry = (stamps, tuple(loaded))
        _ARTIFACT_CACHE[paths] = entry
    return entry[1]


# Function to predict sentiment for new text
def predict_sentiment(text):
    """Predict the sentiment label of a single piece of text.

    The text goes through ``preprocess_text``, the saved TF-IDF vectorizer and
    the saved chi-square selector before being classified by the saved SVM.
    Artifacts are read from ``save_directory`` (looked up when the function is
    called) and cached between calls.

    Args:
        text (str): Raw text to classify.

    Returns:
        str: One of "Negative", "Irrelevant", "Neutral" or "Positive".

    Raises:
        FileNotFoundError: If a saved artifact is missing from ``save_directory``.
        KeyError: If the model predicts a class code without a known label.
    """
    svm_model, vectorizer, chi2_selector = _load_artifacts(save_directory)

    # Apply the same preprocessing and feature pipeline as at training time
    processed_text = preprocess_text(text)
    features = vectorizer.transform([processed_text])
    features = chi2_selector.transform(features)

    # A single sample goes in, so exactly one prediction comes back
    prediction = svm_model.predict(features)
    return _SENTIMENT_LABELS[prediction[0]]

# Quick check of predict_sentiment on a fresh hand-written example
# (alternative negative example: "I had a terrible experience with this product.")
sample_tweet = "This movie was fantastic! I really enjoyed it."
predicted_label = predict_sentiment(sample_tweet)
print("Sample Tweet Prediction:", predicted_label)

Sample Tweet Prediction: Positive


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
