In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Download required NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Global\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Global\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Global\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Load the dataset (replace the file paths with your dataset location)
Sentiment_data = pd.read_csv("new_train_data_s140.csv", encoding='ISO-8859-1')

In [4]:
# Rename columns for easier access
columns = ['Polarity', 'Id', 'Date', 'Query', 'User', 'Text']
Sentiment_data.columns = columns

In [5]:
combined_data = pd.concat([Sentiment_data], ignore_index=True)

# Drop irrelevant columns
combined_data = combined_data.drop(['Id', 'Date', 'Query', 'User'], axis=1)

In [6]:
# Check for null values
print("Null values in Train Data:\n", Sentiment_data.isnull().sum())

Null values in Train Data:
 Polarity    0
Id          0
Date        0
Query       0
User        0
Text        0
dtype: int64


In [7]:
# Clean text function
def clean_text(text):
    # Remove URLs, mentions, hashtags
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\@\w+|\#', '', text)
    
    # Remove special characters, numbers, and punctuation
    text = re.sub(r'[^A-Za-z\s]', '', text)
    
    # Convert text to lowercase
    text = text.lower()
    
    # Tokenize the text
    tokens = word_tokenize(text)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]
    
    # Lemmatization (reduce words to their base form)
    lemmatizer = WordNetLemmatizer()
    lemmatized_text = [lemmatizer.lemmatize(word) for word in filtered_tokens]
    
    # Join the tokens back into a single string
    clean_text = ' '.join(lemmatized_text)
    return clean_text

In [8]:
# Split the dataset for training
X = combined_data['Text']  # Features (text data)
y = combined_data['Polarity']  # Labels (sentiment data)

# Convert text data into numerical features using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)  # Limit the number of features for simplicity
X_tfidf = vectorizer.fit_transform(X)

# Split the data into training and validation sets (80% train, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)


In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Train the Logistic Regression model
model = LogisticRegression()
model.fit(X_train, y_train)  # Using X_train

# Predict on the validation set
y_pred = model.predict(X_val)  # Using X_val

# Evaluate the model
print("Accuracy:", accuracy_score(y_val, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_val, y_pred))
print("Classification Report:\n", classification_report(y_val, y_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.79165625
Confusion Matrix:
 [[124598  34896]
 [ 31774 128732]]
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.78      0.79    159494
           4       0.79      0.80      0.79    160506

    accuracy                           0.79    320000
   macro avg       0.79      0.79      0.79    320000
weighted avg       0.79      0.79      0.79    320000

