In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from nltk.stem.snowball import SnowballStemmer
from spellchecker import SpellChecker  # Added spellchecker library

In [2]:
# Load the training data
path = 'trainingdata.csv'
df = pd.read_csv(path)
df = df.dropna()

In [3]:
# Split the dataset into training and testing sets
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

In [4]:
# Spell Checker initialization
spell_checker = SpellChecker()

# Function for text preprocessing (including spell correction)
def preprocess_text(text):
    if text is None:
        return ''  # Return an empty string if input is None
    
    if not isinstance(text, str):
        return str(text)  # Convert non-string types to string
    
    # Tokenize the text into words
    words = text.split()
    
    # Spell correction for each word
    corrected_words = [spell_checker.correction(word) for word in words]
    
    # Join the corrected words back into a sentence
    corrected_text = ' '.join(corrected_words)
    
    return corrected_text

In [5]:
# Define multiple pipelines with different classifiers
classifiers = {
    'SVM': Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf-svm', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42))
    ]),
    'RandomForest': Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf-rf', RandomForestClassifier(n_estimators=100, random_state=42))
    ]),
    'NaiveBayes': Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf-nb', MultinomialNB())
    ]),
    'SVM_RBF': Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf-svm', SVC(kernel='rbf', gamma='auto', random_state=42))
    ]),
}


In [6]:
# Train and evaluate each classifier
best_model = None
best_accuracy = 0.0

for name, classifier in classifiers.items():
    # Train the classifier
    classifier.fit(train_data['sentences'], train_data['sentiments'])

    # Evaluate accuracy on the test set
    predicted = classifier.predict(test_data['sentences'])
    accuracy = accuracy_score(test_data['sentiments'], predicted)
    print(f"Accuracy for {name}: {accuracy * 100:.2f}%")

    # Display classification report for more detailed evaluation
    print(f"Classification Report for {name}:\n{classification_report(test_data['sentiments'], predicted)}")

    # Update best model if accuracy is higher
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = classifier


Accuracy for SVM: 86.59%
Classification Report for SVM:
              precision    recall  f1-score   support

        -1.0       0.89      0.91      0.90       111
         0.0       0.88      0.54      0.67        41
         1.0       0.84      0.94      0.89       109

    accuracy                           0.87       261
   macro avg       0.87      0.80      0.82       261
weighted avg       0.87      0.87      0.86       261

Accuracy for RandomForest: 78.93%
Classification Report for RandomForest:
              precision    recall  f1-score   support

        -1.0       0.83      0.78      0.81       111
         0.0       0.72      0.51      0.60        41
         1.0       0.77      0.90      0.83       109

    accuracy                           0.79       261
   macro avg       0.77      0.73      0.75       261
weighted avg       0.79      0.79      0.78       261

Accuracy for NaiveBayes: 80.84%
Classification Report for NaiveBayes:
              precision    recall  f1-

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [7]:
# Function to predict sentiment using the best model
def predict_sentiment(sentence):
    result = best_model.predict([sentence])[0]
    return result

In [8]:
# Function to interactively enter sentences and check sentiment
def enter_sentences():
    while True:
        user_input = input("Enter a sentence (or 'exit' to stop): ")
        if user_input.lower() == 'exit':
            break
        predicted_sentiment = predict_sentiment(user_input)
        predicted_sentiment_label = sentiment_labels[predicted_sentiment]
        print(f"Predicted Sentiment: {predicted_sentiment_label}")

# Map the numerical sentiment to labels
sentiment_labels = {1: 'Positive', 0: 'Neutral', -1: 'Negative'}


In [None]:
# Test the function with interactive sentence entry
enter_sentences()

# Print the best model's accuracy on the test set
print(f"Best Model Accuracy on the test set: {best_accuracy * 100:.2f}%")

Enter a sentence (or 'exit' to stop): She is good in teaching
Predicted Sentiment: Positive
Enter a sentence (or 'exit' to stop): She is average in teaching
Predicted Sentiment: Neutral
Enter a sentence (or 'exit' to stop): She is not good in teaching
Predicted Sentiment: Negative
