# Sentiment Analysis

Natural Language Processing (NLP) is a subset of Artificial Intelligence where we aim to train computers to understand human languages.

### Import Libraries

In [1]:
import numpy as np 
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB

In [2]:
# Load the IMDB reviews CSV. The location can be overridden by setting the
# IMDB_DATASET_PATH environment variable; when it is unset, the original
# author's local download path is used so existing setups keep working.
import os

DATASET_PATH = os.environ.get(
    "IMDB_DATASET_PATH",
    r"C:\Users\ASUS\Downloads\archive (12)\IMDB Dataset.csv",
)
df = pd.read_csv(DATASET_PATH)

In [3]:
# Preview the first rows to confirm the review text and sentiment label loaded.
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


#### Sample a balanced subset of the dataset

In [4]:
# Build a balanced subset: the same number of reviews from each sentiment
# class, drawn with a fixed seed so the selection is repeatable.
rows_per_class = 2000
seed = 1

is_positive = df['sentiment'] == 'positive'
is_negative = df['sentiment'] == 'negative'
positive_samples = df[is_positive].sample(n=rows_per_class, random_state=seed)
negative_samples = df[is_negative].sample(n=rows_per_class, random_state=seed)

# Stack the two classes, then shuffle so they are interleaved, and renumber
# the rows from zero.
stacked = pd.concat([positive_samples, negative_samples])
final_samples = stacked.sample(frac=1, random_state=seed).reset_index(drop=True)

# Keep a copy of the reduced dataset on disk.
final_samples.to_csv('reduced_dataset.csv', index=False)

# Quick look at the result.
print(final_samples.head())


                                              review sentiment
0  As a longtime admirer of the 2001 film "Moulin...  positive
1  I'm a writer working at home and Diagnosis Mur...  positive
2  I've read one comment which labeled this film ...  positive
3  This film is enjoyable if you like poverty row...  negative
4  In the opening scene, the eye patch wearing de...  positive


In [5]:
# Check the subset size as (rows, columns).
final_samples.shape

(4000, 2)

In [6]:
# Tooling for the text-cleaning step: regex and punctuation helpers from the
# standard library, plus NLTK's stemmer and stop-word corpus.
import re
import string

import nltk
from nltk.corpus import stopwords

# Make sure the stop-word corpus is available locally before it is read.
nltk.download('stopwords')

# Set of English stop words, used for membership tests during cleaning.
stopword = set(stopwords.words('english'))
# English Snowball stemmer shared by every call to the cleaning function.
stemmer = nltk.SnowballStemmer("english")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Text Preprocessing

In [7]:
def clean(text):
    """Normalise raw review text into a string of stemmed tokens.

    Steps, in order: lower-case the text; drop bracketed spans and URLs;
    replace HTML tags with a space; strip punctuation; drop any token that
    contains a digit; remove stop words; stem the remaining tokens.

    Relies on the module-level ``stemmer`` (object with a ``stem`` method) and
    ``stopword`` (collection of words to discard).

    Args:
        text: The raw text. Non-string values are coerced with ``str()``.

    Returns:
        str: The surviving stemmed tokens joined by single spaces; an empty
        string when nothing survives.
    """
    text = str(text).lower()
    # Raw strings keep the regex escapes (\[, \S, \w, \d) from being read as
    # invalid string escapes.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # Replace tags with a space, not nothing: otherwise "end.<br />Start"
    # collapses into the single junk token "endstart" once punctuation goes.
    text = re.sub(r'<.*?>+', ' ', text)
    # NOTE(review): punctuation is stripped before stop-word filtering, so any
    # stop word containing an apostrophe can no longer match - confirm intent.
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    # split() with no argument breaks on any whitespace run, so newlines
    # separate words instead of gluing them together, and no empty tokens
    # are produced.
    tokens = [word for word in text.split() if word not in stopword]
    return " ".join(stemmer.stem(word) for word in tokens)
# Clean every review and store the result back in the same column.
# NOTE(review): this overwrites the raw text in df, so re-running the cell
# cleans already-cleaned text. It also processes the full df rather than the
# balanced final_samples built earlier - confirm which frame should feed the
# model.
df["review"] = df["review"].apply(clean)

### Split the data into training and testing sets

In [8]:
# Pull the cleaned text and its labels out of the frame as NumPy arrays.
x, y = (np.array(df[column]) for column in ("review", "sentiment"))

# Turn each review into a vector of token counts.
# NOTE(review): the vocabulary is learned from every row before the split, so
# it includes words that occur only in the held-out portion.
cv = CountVectorizer()
X = cv.fit_transform(x)

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
splits = train_test_split(X, y, test_size=0.20, random_state=42)
X_train, X_test, y_train, y_test = splits

# Model Training

In [9]:
from sklearn.linear_model import PassiveAggressiveClassifier

# Seed the classifier: by default it shuffles the training data between
# epochs, so an unseeded fit can produce different weights on every run.
# The seed matches the one used for the train/test split.
model = PassiveAggressiveClassifier(random_state=42)
model.fit(X_train, y_train)

PassiveAggressiveClassifier()

# Model Testing | prediction

In [12]:
# Classify one line of text typed by the user.
user = input("Enter a Text: ")
# Pass the input through the same clean() applied to the training reviews so
# that its tokens (lower-cased, stemmed, stop words removed) line up with the
# vocabulary the vectorizer learned; raw text would miss those stemmed terms.
data = cv.transform([clean(user)]).toarray()
output = model.predict(data)
print(output)

Enter a Text: i like this movie
['positive']
