In [2]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import joblib

In [2]:
df = pd.read_csv("twitter_training.csv", header=None)
df.columns = ["ID", "Account", "Sentiment", "Statement"]

In [3]:
df

Unnamed: 0,ID,Account,Sentiment,Statement
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
...,...,...,...,...
74677,9200,Nvidia,Positive,Just realized that the Windows partition of my...
74678,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74679,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74680,9200,Nvidia,Positive,Just realized between the windows partition of...


In [4]:
df.duplicated().sum()

2700

In [5]:
df = df.drop_duplicates()

In [6]:
df.duplicated().sum()

0

In [7]:
df.isnull().sum()

ID             0
Account        0
Sentiment      0
Statement    326
dtype: int64

In [8]:
df.dtypes

ID            int64
Account      object
Sentiment    object
Statement    object
dtype: object

In [9]:
df["Statement"].mode()[0]

' '

In [10]:
df["Statement"] = df["Statement"].fillna(df["Statement"].mode()[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Statement"] = df["Statement"].fillna(df["Statement"].mode()[0])


In [11]:
df.isnull().sum()

ID           0
Account      0
Sentiment    0
Statement    0
dtype: int64

In [12]:
df.duplicated().sum()

172

In [13]:
df = df.drop_duplicates()

In [14]:
df.duplicated().sum()

0

In [15]:
df.isnull().sum()

ID           0
Account      0
Sentiment    0
Statement    0
dtype: int64

In [16]:
df

Unnamed: 0,ID,Account,Sentiment,Statement
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
...,...,...,...,...
74677,9200,Nvidia,Positive,Just realized that the Windows partition of my...
74678,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74679,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74680,9200,Nvidia,Positive,Just realized between the windows partition of...


In [17]:
df.to_csv("cleaned_twitter_training.csv")

In [18]:
df = pd.read_csv("cleaned_twitter_training.csv", index_col=0)

In [19]:
df

Unnamed: 0,ID,Account,Sentiment,Statement
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
...,...,...,...,...
74677,9200,Nvidia,Positive,Just realized that the Windows partition of my...
74678,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74679,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74680,9200,Nvidia,Positive,Just realized between the windows partition of...


In [20]:
df.dtypes

ID            int64
Account      object
Sentiment    object
Statement    object
dtype: object

In [21]:
#Clean the Text Data

def clean_text(text):
    # Remove special characters and numbers
    text = re.sub(r'[^A-Za-z\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    return text

# Apply the cleaning function to the text column
df['cleaned_text'] = df['Statement'].apply(clean_text)

In [22]:
#Remove Stopwords

# Download the stopwords from NLTK
nltk.download('stopwords')

# Set of English stopwords
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    words = text.split()
    words = [word for word in words if word not in stop_words]
    return ' '.join(words)

# Apply the function to remove stopwords
df['cleaned_text'] = df['cleaned_text'].apply(remove_stopwords)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [23]:
#Tokenize the Text

# Download necessary NLTK data for tokenization
nltk.download('punkt')

def tokenize_text(text):
    return word_tokenize(text)

# Apply the function to tokenize the text
df['tokens'] = df['cleaned_text'].apply(tokenize_text)

# Display the dataframe with the original text, cleaned text, and tokens
print(df[['Statement', 'cleaned_text', 'tokens']].head())

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


                                           Statement  \
0  im getting on borderlands and i will murder yo...   
1  I am coming to the borders and I will kill you...   
2  im getting on borderlands and i will kill you ...   
3  im coming on borderlands and i will murder you...   
4  im getting on borderlands 2 and i will murder ...   

                    cleaned_text                              tokens  
0  im getting borderlands murder  [im, getting, borderlands, murder]  
1            coming borders kill             [coming, borders, kill]  
2    im getting borderlands kill    [im, getting, borderlands, kill]  
3   im coming borderlands murder   [im, coming, borderlands, murder]  
4  im getting borderlands murder  [im, getting, borderlands, murder]  


In [24]:
#Convert Text Data to TF-IDF Features

# Initialize the TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=100000)

# Fit and transform the cleaned text data
tfidf_matrix = tfidf_vectorizer.fit_transform(df['cleaned_text'])

In [25]:
#Split the Data

X = tfidf_matrix  # Use the sparse matrix directly
y = df['Sentiment']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [26]:
# Standardize the data

scaler = StandardScaler(with_mean=False)  # with_mean=False to keep the sparse format
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [27]:
#Train and Evaluate Models

# Train the neural network model with scaled data
nn_model = MLPClassifier(hidden_layer_sizes=(500,), max_iter=500)
nn_model.fit(X_train_scaled, y_train)

# Make predictions
nn_predictions = nn_model.predict(X_test_scaled)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, nn_predictions)
precision = precision_score(y_test, nn_predictions, average='weighted')
recall = recall_score(y_test, nn_predictions, average='weighted')
f1 = f1_score(y_test, nn_predictions, average='weighted')

# Print the metrics
print("Neural Network Model Evaluation:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

# Print detailed classification report
print("Classification Report:\n", classification_report(y_test, nn_predictions))


Neural Network Model Evaluation:
Accuracy: 0.8773151371675254
Precision: 0.8779456425256416
Recall: 0.8773151371675254
F1 Score: 0.8774745832729801
Classification Report:
               precision    recall  f1-score   support

  Irrelevant       0.85      0.86      0.86      2516
    Negative       0.91      0.89      0.90      4352
     Neutral       0.88      0.87      0.88      3494
    Positive       0.85      0.88      0.87      4000

    accuracy                           0.88     14362
   macro avg       0.88      0.88      0.87     14362
weighted avg       0.88      0.88      0.88     14362



In [29]:
joblib.dump(nn_model, 'neural_network_model.pkl')
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')
joblib.dump(scaler, 'scaler.pkl')

['scaler.pkl']