In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import accuracy_score, classification_report
import re

# Download the NLTK stop-word corpus when this module is executed, then keep
# the English stop words in a set; clean_text() tests tokens against it.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def load_dataset(file_path):
    """Read the CSV at ``file_path`` into a DataFrame.

    The file is decoded as latin1. When the file is missing, empty or cannot
    be parsed, a short message is printed and the original exception is
    re-raised unchanged so the caller can still handle it.

    Args:
        file_path: Location of the CSV file to read.

    Returns:
        The parsed ``pandas.DataFrame``.

    Raises:
        FileNotFoundError: the path does not exist.
        pandas.errors.EmptyDataError: the file has no data.
        pandas.errors.ParserError: the file is not valid CSV.
    """
    # Checked in this order; the first matching type decides the message.
    failure_messages = (
        (FileNotFoundError, f"Error: File '{file_path}' not found."),
        (pd.errors.EmptyDataError, "Error: File is empty."),
        (pd.errors.ParserError, "Error: File could not be parsed."),
    )
    handled = tuple(exc_type for exc_type, _ in failure_messages)

    try:
        frame = pd.read_csv(file_path, encoding='latin1')
    except handled as exc:
        for exc_type, message in failure_messages:
            if isinstance(exc, exc_type):
                print(message)
                break
        raise
    return frame

def clean_text(text):
    """Normalise one message before it is vectorised.

    The text is lower-cased, stripped of punctuation and filtered against the
    module-level ``stop_words`` set; the surviving tokens are joined with
    single spaces.

    Stop words are matched both before and after apostrophes are removed.
    Stripping all punctuation first would turn a contraction entry of the
    stop-word list (NLTK's English list contains forms like "don't") into
    "dont", which no longer matches the list and would leak into the features.

    Args:
        text: The raw message. ``None`` and NaN are treated as an empty
            message; any other non-string value is converted with ``str()``.

    Returns:
        The cleaned, space-separated token string (possibly empty).
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    lowered = str(text).lower()
    # Remove punctuation but keep apostrophes for now so contractions can
    # still be compared with the stop-word list in their original spelling.
    lowered = re.sub(r"[^\w\s']", '', lowered)

    kept = []
    for token in lowered.split():  # split() also collapses runs of whitespace
        if token in stop_words:
            continue
        token = token.replace("'", '')
        # A token made only of apostrophes is empty at this point.
        if token and token not in stop_words:
            kept.append(token)
    return ' '.join(kept)

def preprocess_data(df):
    """Validate and clean the raw dataset, modifying ``df`` in place.

    Prints the column names, checks that both 'content' and 'sentiment'
    exist, fills missing messages with '' and missing labels with 'neutral',
    casts labels to ``str`` and runs every message through ``clean_text``.

    Args:
        df: DataFrame holding at least the 'content' and 'sentiment' columns.

    Returns:
        The same DataFrame object, after cleaning.

    Raises:
        KeyError: if either required column is absent.
    """
    # Echo the header so a mismatch with the expected names is easy to spot.
    print("Columns in the dataset:", df.columns.tolist())

    required_present = all(name in df.columns for name in ('content', 'sentiment'))
    if not required_present:
        raise KeyError("Required columns are missing from the dataset.")

    df['content'] = df['content'].fillna('')
    # Rows without a label are assigned 'neutral' (assumed to be a valid class).
    df['sentiment'] = df['sentiment'].fillna('neutral').astype(str)
    df['content'] = df['content'].apply(clean_text)
    return df

def train_model(X_train, y_train):
    """Fit the TF-IDF vectoriser, the label encoder and the dense classifier.

    Args:
        X_train: Cleaned message strings used for training.
        y_train: Sentiment labels aligned with ``X_train``.

    Returns:
        A ``(model, vectorizer, label_encoder)`` tuple: the trained Keras
        model, the fitted ``TfidfVectorizer`` and the fitted ``LabelEncoder``.
        The same vectoriser and encoder must be reused at prediction time.
    """
    # Local import keeps this function self-contained; the file's import
    # section only pulls Dense and Dropout from this module.
    from tensorflow.keras.layers import Input

    vectorizer = TfidfVectorizer(max_features=5000)
    # The dense layers below are fed a dense array, not a sparse matrix.
    X_train_vec = vectorizer.fit_transform(X_train).toarray()

    label_encoder = LabelEncoder()
    y_train_encoded = label_encoder.fit_transform(y_train)
    y_train_one_hot = to_categorical(y_train_encoded)

    n_features = X_train_vec.shape[1]
    n_classes = y_train_one_hot.shape[1]

    model = Sequential()
    # Declare the input shape with an explicit Input layer rather than the
    # deprecated `input_dim=` argument on the first Dense layer.
    model.add(Input(shape=(n_features,)))
    model.add(Dense(512, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(256, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(n_classes, activation='softmax'))

    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    # Stop once validation loss has not improved for 3 epochs and roll back
    # to the best weights seen, to limit overfitting.
    early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

    # validation_split=0.1 holds out 10% of the training rows for validation.
    model.fit(
        X_train_vec,
        y_train_one_hot,
        epochs=10,
        batch_size=32,
        validation_split=0.1,
        callbacks=[early_stopping],
        verbose=1,
    )

    return model, vectorizer, label_encoder

def evaluate_model(model, vectorizer, label_encoder, X_test, y_test):
    """Score the trained model on held-out data.

    Args:
        model: Trained classifier whose ``predict`` returns per-class scores.
        vectorizer: The fitted vectoriser returned by ``train_model``.
        label_encoder: The fitted label encoder returned by ``train_model``.
        X_test: Cleaned message strings to evaluate on.
        y_test: True sentiment labels aligned with ``X_test``.

    Returns:
        ``(accuracy, report)``: the overall accuracy and the text
        classification report with one row per known class.
    """
    X_test_vec = vectorizer.transform(X_test).toarray()

    # The encoded labels are already class indices; no one-hot round trip
    # is needed to compare them with the argmax of the predictions.
    y_true = label_encoder.transform(y_test)
    y_pred = np.argmax(model.predict(X_test_vec), axis=1)

    class_names = label_encoder.classes_
    accuracy = accuracy_score(y_true, y_pred)
    report = classification_report(
        y_true,
        y_pred,
        # Pin the label set to every class the encoder knows. Without this,
        # a class missing from both y_true and y_pred makes the number of
        # observed labels differ from len(target_names) and the call fails.
        labels=np.arange(len(class_names)),
        target_names=class_names,
        # Classes that are never predicted score 0 without emitting an
        # undefined-metric warning for each of them.
        zero_division=0,
    )

    return accuracy, report

def predict_sentiment(model, vectorizer, label_encoder, text):
    """Classify a single raw message and return its sentiment label.

    The message is cleaned with ``clean_text``, vectorised with the fitted
    ``vectorizer``, scored by ``model``, and the highest-scoring class index
    is decoded back to its label through ``label_encoder``.
    """
    cleaned = clean_text(text)
    # The vectoriser expects an iterable of documents, hence the 1-item list.
    features = vectorizer.transform([cleaned]).toarray()
    class_index = np.argmax(model.predict(features), axis=1)
    return label_encoder.inverse_transform(class_index)[0]

def _run_prompt_loop(model, vectorizer, label_encoder):
    """Read messages from stdin and print their predicted sentiment until the user quits."""
    while True:
        try:
            user_input = input("Enter a message to analyze sentiment (or type 'exit' to quit): ")
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or Ctrl-C ends the session cleanly instead of
            # surfacing a traceback.
            print()
            break

        message = user_input.strip()
        # Surrounding whitespace must not prevent 'exit' from being recognised.
        if message.lower() == 'exit':
            break
        if not message:
            # Nothing to classify; ask again.
            continue

        sentiment = predict_sentiment(model, vectorizer, label_encoder, message)
        print(f"The sentiment of the message is: {str(sentiment).capitalize()}")


def main(file_path):
    """Train and evaluate the sentiment classifier, then start an interactive prompt.

    Loads and cleans the CSV at ``file_path``, holds out 20% of the rows for
    testing (fixed ``random_state`` for reproducible splits), trains the
    model, prints accuracy, the classification report and the label-index
    mapping, and finally classifies messages typed by the user.

    Args:
        file_path: Path to a CSV file with 'content' and 'sentiment' columns.
    """
    df = preprocess_data(load_dataset(file_path))

    X = df['content']
    y = df['sentiment']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    print("Training data shape:", X_train.shape)
    print("Testing data shape:", X_test.shape)

    model, vectorizer, label_encoder = train_model(X_train, y_train)
    accuracy, report = evaluate_model(model, vectorizer, label_encoder, X_test, y_test)

    print(f"Accuracy: {accuracy}")
    print(f"Classification Report:\n{report}")

    print("\nLabel encoding mapping:")
    for index, label in enumerate(label_encoder.classes_):
        print(f"{index}: {label}")

    _run_prompt_loop(model, vectorizer, label_encoder)

# Script entry point: run the full pipeline on the CSV file expected in the
# current working directory.
if __name__ == "__main__":
    main('Sentiment_Analysis.csv')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


Columns in the dataset: ['tweet_id', 'sentiment', 'author', 'content']
Training data shape: (32000,)
Testing data shape: (8000,)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m900/900[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 24ms/step - accuracy: 0.2603 - loss: 2.1154 - val_accuracy: 0.3356 - val_loss: 1.9315
Epoch 2/10
[1m900/900[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 23ms/step - accuracy: 0.4053 - loss: 1.7762 - val_accuracy: 0.3322 - val_loss: 1.9269
Epoch 3/10
[1m900/900[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 22ms/step - accuracy: 0.4939 - loss: 1.5358 - val_accuracy: 0.3325 - val_loss: 1.9976
Epoch 4/10
[1m900/900[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 22ms/step - accuracy: 0.5851 - loss: 1.2814 - val_accuracy: 0.3128 - val_loss: 2.1403
Epoch 5/10
[1m900/900[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 22ms/step - accuracy: 0.6708 - loss: 1.0338 - val_accuracy: 0.3097 - val_loss: 2.3499
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy: 0.344125
Classification Report:
              precision    recall  f1-score   support

       anger       0.00      0.00      0.00        19
     boredom       0.00      0.00      0.00        31
       empty       0.00      0.00      0.00       162
  enthusiasm       0.00      0.00      0.00       163
         fun       0.00      0.00      0.00       338
   happiness       0.30      0.43      0.35      1028
        hate       0.44      0.20      0.27       268
        love       0.40      0.45      0.42       762
     neutral       0.35      0.51      0.41      1740
      relief       0.43      0.01      0.02       352
     sadness       0.34      0.24      0.28      1046
    surprise       0.40      0.04      0.07       425
       worry       0.34      0.46      0.39      1666

    accuracy                           0.34      8000
   macro avg       0.23      0.18      0.17      8000
weighted avg       0.32      0.34      0.31      8000


Label encoding mapping:
0: anger
1: 

Enter a message to analyze sentiment (or type 'exit' to quit):  i am having math class today


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
The sentiment of the message is: Neutral
