<a href="https://colab.research.google.com/github/Anmol-14/Movie-Review-Model/blob/main/Streamlit.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
from google.colab import drive
# Mount Google Drive at /content/drive. The training cell later writes the
# pickled model and vectorizer under /content/drive/MyDrive, so this must run first.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# Import libraries
import pandas as pd
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import movie_reviews
import re

# Fetch every NLTK resource this notebook asks for
for resource_name in ('stopwords', 'wordnet', 'movie_reviews'):
    nltk.download(resource_name)

# Read the raw text of each file in the NLTK movie_reviews corpus and pair it
# with a sentiment label derived from the corpus category ('pos' / 'neg')
positive_reviews = [
    (movie_reviews.raw(fileid), 'positive')
    for fileid in movie_reviews.fileids('pos')
]
negative_reviews = [
    (movie_reviews.raw(fileid), 'negative')
    for fileid in movie_reviews.fileids('neg')
]
reviews = positive_reviews + negative_reviews

# Two-column frame: the review text and its sentiment label
imdb_data = pd.DataFrame(reviews, columns=['review', 'sentiment'])

# Preprocess the text
_ENGLISH_STOPWORDS = None  # lazily filled cache so the corpus file is read only once


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, loading them on first use only."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(nltk.corpus.stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def preprocess_text(text, stop_words=None):
    """Normalise a review into a space-separated string of lowercase content words.

    Steps: strip anything starting with ``http`` up to the next whitespace,
    replace every non-ASCII-letter character with a space, lowercase, split on
    whitespace, and drop stopwords.

    Parameters
    ----------
    text : str
        Raw review text.
    stop_words : iterable of str, optional
        Words to remove. When omitted, NLTK's English stopword list is used
        (the 'stopwords' resource must already be downloaded).

    Returns
    -------
    str
        The remaining words joined by single spaces; empty string if none remain.
    """
    # Build the lookup set once per call instead of once per word: the original
    # rebuilt set(stopwords.words('english')) inside the comprehension's filter.
    if stop_words is None:
        blocked = _english_stopwords()
    else:
        blocked = frozenset(stop_words)

    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    text = text.lower()
    words = [word for word in text.split() if word not in blocked]
    return ' '.join(words)

# Clean the raw review text before it is split and vectorised
cleaned_reviews = imdb_data['review'].apply(preprocess_text)
imdb_data['review'] = cleaned_reviews

# Hold out 20% of the reviews for evaluation (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    imdb_data['review'],
    imdb_data['sentiment'],
    test_size=0.2,
    random_state=42,
)

# Fit a TF-IDF vocabulary capped at 5000 terms on the training reviews only,
# then encode the held-out reviews with that same vocabulary
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Fit a 100-tree random forest on the TF-IDF training matrix (seeded for repeatability)
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_tfidf,
    y_train,
)

# Score the held-out reviews and report the fraction labelled correctly
y_pred = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
print(f'Model Accuracy: {accuracy}')

# Pickle the fitted classifier and the fitted vectorizer to Google Drive,
# one file per artifact (model first, then vectorizer)
for artifact, target_path in (
    (model, '/content/drive/MyDrive/Colab Notebooks/sentiment_model.pkl'),
    (vectorizer, '/content/drive/MyDrive/Colab Notebooks/tfidf_vectorizer.pkl'),
):
    with open(target_path, 'wb') as sink:
        pickle.dump(artifact, sink)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


Model Accuracy: 0.7725


In [6]:
# Vocabulary terms from the fitted vectorizer and the forest's per-feature
# importance scores, in matching column order
feature_names = vectorizer.get_feature_names_out()
feature_importances = model.feature_importances_

# Pair each term with its importance and rank from most to least important
importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
)

# Show the five highest-ranked terms
print(importance_df.head())