In [None]:
import pandas as pd
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

# Fetch the NLTK resources needed below: the stop-word corpus and the
# 'punkt' tokenizer models used by word_tokenize.
for _nltk_resource in ('stopwords', 'punkt'):
    nltk.download(_nltk_resource)

# Read the IMDB reviews CSV; later cells use its 'review' and 'sentiment' columns.
# NOTE(review): the path is absolute (filesystem root) — adjust it to wherever
# the dataset lives in your environment.
data = pd.read_csv('/IMDB Dataset.csv')
data


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [None]:
# Text clean-up applied to every review before TF-IDF vectorisation.
from functools import lru_cache

# Translation table that deletes every ASCII punctuation character. Built once
# here instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


@lru_cache(maxsize=None)
def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    return frozenset(stopwords.words('english'))


def preprocess_text(text):
    """Remove punctuation and English stop words from ``text``.

    Parameters
    ----------
    text : str or None
        Raw review text. Missing values (``None`` or a float NaN, which is
        how pandas represents an empty cell) are treated as an empty review.

    Returns
    -------
    str
        The remaining tokens joined by single spaces. Token case is preserved;
        lower-casing is only used for the stop-word comparison.

    Notes
    -----
    Punctuation is deleted rather than replaced by a space, so ``"well-known"``
    becomes ``"wellknown"`` and an HTML tag such as ``<br />`` leaves the
    token ``br`` behind.
    """
    # Missing review: nothing to tokenise.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # The stop-word set is cached: re-reading the corpus for each of the
    # thousands of reviews processed via .apply() is pure overhead.
    stop_words = _english_stop_words()
    tokens = word_tokenize(text.translate(_PUNCTUATION_TABLE))
    return ' '.join(token for token in tokens if token.lower() not in stop_words)

# Clean every review (punctuation and stop words removed) and keep the result
# in a new column so the raw 'review' text stays available.
data['processed_review'] = data['review'].apply(preprocess_text)

# Hold out 20% of the reviews for evaluation; random_state fixes the shuffle so
# the split is reproducible between runs.
text_train, text_test, label_train, label_test = train_test_split(data['processed_review'], data['sentiment'], test_size=0.2, random_state=42)

# Convert text to numerical features using TF-IDF.
# The vectorizer is fitted on the training texts only and then merely applied
# to the test texts, so no vocabulary/IDF information leaks from the test set.
tfidf = TfidfVectorizer()
X_train = tfidf.fit_transform(text_train)
X_test = tfidf.transform(text_test)

# Train a Support Vector Machine (SVM) classifier with a linear kernel.
# NOTE(review): SVC fit time grows quickly with the number of samples; for a
# linear kernel on sparse TF-IDF features LinearSVC is usually much faster —
# consider switching if training time is a problem.
classifier = SVC(kernel='linear')
classifier.fit(X_train, label_train)

# Predict the labels for the held-out test data
predictions = classifier.predict(X_test)

# Evaluate the model: overall accuracy first, then per-class
# precision/recall/F1 via classification_report.
accuracy = accuracy_score(label_test, predictions)
print(f"Accuracy: {accuracy:.2f}")

report = classification_report(label_test, predictions)
print("Classification Report:")
print(report)

# Score a single new review. It must go through the same preprocess_text step
# and the already-fitted `tfidf` vectorizer (transform, not fit_transform) so
# that its features line up with what the classifier was trained on.
new_review = "Great service and fantastic experience!"
new_review_processed = preprocess_text(new_review)
new_review_vectorized = tfidf.transform([new_review_processed])
predicted_sentiment = classifier.predict(new_review_vectorized)
print(f"Predicted sentiment for '{new_review}': {predicted_sentiment[0]}")

Accuracy: 0.90
Classification Report:
              precision    recall  f1-score   support

    negative       0.91      0.89      0.90      4961
    positive       0.89      0.91      0.90      5039

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000

Predicted sentiment for 'Great service and fantastic experience!': positive


In [None]:
# Sanity check with a short negative phrase, using the same steps as the
# positive example: preprocess, vectorise with the fitted `tfidf`, then predict.
# Relies on `preprocess_text`, `tfidf` and `classifier` from the training cell.
new_review = "bad experience!"
new_review_processed = preprocess_text(new_review)
new_review_vectorized = tfidf.transform([new_review_processed])
predicted_sentiment = classifier.predict(new_review_vectorized)
print(f"Predicted sentiment for '{new_review}': {predicted_sentiment[0]}")

Predicted sentiment for 'bad experience!': negative


In [None]:
from joblib import dump
# Save the trained classifier
model_file = 'sentiment_analysis_model.joblib'
dump(classifier, model_file)
print(f"Trained model saved as {model_file}")

# Also persist the fitted TF-IDF vectorizer. The classifier only accepts
# feature vectors produced by this exact vocabulary/IDF weighting, so without
# it the saved model cannot be used from a fresh kernel or another process.
vectorizer_file = 'tfidf_vectorizer.joblib'
dump(tfidf, vectorizer_file)
print(f"Fitted vectorizer saved as {vectorizer_file}")

Trained model saved as sentiment_analysis_model.joblib


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import nltk
from joblib import load

# Make sure the NLTK stop-word corpus and the 'punkt' tokenizer data are present
for _nltk_resource in ('stopwords', 'punkt'):
    nltk.download(_nltk_resource)

# Restore the classifier that was written with joblib's dump(); point this at
# your own model file if it is stored elsewhere. joblib files are pickle-based,
# so only load files you trust.
model = load('sentiment_analysis_model.joblib')

# Text clean-up for inference; must match the preprocessing used at training
# time so the vectorizer sees the same kind of tokens.
from functools import lru_cache

# Translation table that deletes every ASCII punctuation character. Built once
# here instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


@lru_cache(maxsize=None)
def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    return frozenset(stopwords.words('english'))


def preprocess_text(text):
    """Remove punctuation and English stop words from ``text``.

    Parameters
    ----------
    text : str or None
        Raw input text. Missing values (``None`` or a float NaN) are treated
        as empty text.

    Returns
    -------
    str
        The remaining tokens joined by single spaces. Token case is preserved;
        lower-casing is only used for the stop-word comparison.

    Notes
    -----
    Punctuation is deleted rather than replaced by a space, so ``"well-known"``
    becomes ``"wellknown"``.
    """
    # Missing input: nothing to tokenise.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Cached so repeated predictions do not re-read the stop-word corpus.
    stop_words = _english_stop_words()
    tokens = word_tokenize(text.translate(_PUNCTUATION_TABLE))
    return ' '.join(token for token in tokens if token.lower() not in stop_words)

# Function to get sentiment prediction
def predict_sentiment(input_text, vectorizer=None, classifier=None):
    """Predict the sentiment label for a single piece of text.

    Parameters
    ----------
    input_text : str
        Raw text to classify.
    vectorizer : optional
        Fitted vectorizer exposing ``transform``. Defaults to the notebook
        global ``tfidf``. It must be the same fitted object that produced the
        classifier's training features.
    classifier : optional
        Trained estimator exposing ``predict``. Defaults to the notebook
        global ``model``.

    Returns
    -------
    The first (and only) element of the classifier's prediction.

    Raises
    ------
    NameError
        If an argument is omitted and the corresponding global is not defined,
        e.g. on a fresh kernel where the training cell has not been run.
    """
    if vectorizer is None:
        try:
            vectorizer = tfidf
        except NameError:
            # Same exception type as before, but say what is actually missing.
            raise NameError(
                "No fitted vectorizer available: run the training cell so that "
                "`tfidf` exists, or pass `vectorizer=` explicitly."
            ) from None
    if classifier is None:
        try:
            classifier = model
        except NameError:
            raise NameError(
                "No trained model available: load it into `model`, or pass "
                "`classifier=` explicitly."
            ) from None

    processed_input = preprocess_text(input_text)
    # The vectorizer expects an iterable of documents, hence the one-item list.
    features = vectorizer.transform([processed_input])
    return classifier.predict(features)[0]

# Prompt for a piece of text and print the predicted sentiment.
# input() waits for the user to type a line, so this cell pauses an
# unattended "Run All" until something is entered.
user_input = input("Enter text to predict sentiment: ")
predicted_sentiment = predict_sentiment(user_input)
print(f"Predicted sentiment: {predicted_sentiment}")



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Enter text to predict sentiment: it was fine
Predicted sentiment: positive
