In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
import datetime as dt
from scipy.stats import stats

In [3]:
data = pd.read_csv("movies new CLEAN.csv")
data.drop(columns=['Unnamed: 3',"Unnamed: 6","date","verifed","reviewID"],axis=1,inplace=True)

In [4]:
data.dropna(inplace=True)

In [5]:
data['vote']=data["vote"].replace(" ",None)
data['vote']=pd.to_numeric(data["vote"])
data['vote']=data['vote'].fillna(data['vote'].mean())
data['vote']=data['vote'].astype("int64")
data=data.sample(n=10000,random_state=124)

In [6]:
data["format"]=data["format"].str.strip()

In [7]:
data=data.drop(data[data['format'].isin(["Paperback","UMD for","Bernie (xyzzy)","Kitchen","Hardcover"])].index,axis=0)

In [8]:
data=data.drop_duplicates()
data=data[data['reviewerName'] != "reviewerName"]

In [9]:
data.reset_index(drop=True,inplace=True)

In [10]:
import re
import contractions
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
lm=WordNetLemmatizer()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aazar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [11]:
stopwords=nltk.corpus.stopwords.words("english")
stopwords=set(stopwords)-{"not"}
stopwords.update({"was"})

In [12]:

import re
from nltk.tokenize import word_tokenize

def lower(x):
    x = x.lower()
    return x

def strip(x):
    x = x.strip()
    return x

def expand_contraction(x):
    x = contractions.fix(x)  # Assuming you have a contractions dictionary or module imported
    return x

def remove_special_char_and_digit(x):
    x = re.sub(r' +', ' ', x)  # Replace multiple spaces with a single space
    x = re.sub(r"[^\w\s]|[\d]", " ", x)  # Remove non-word characters and digits
    x = re.sub("\s{2,}", " ", x)  # Remove extra spaces
    x = re.sub(r"[-()\"#/@;:{}`+=~|.!?\*&£%€¦_><‘|,'0-9]", " ", x)  # Remove specific punctuation
    x = re.sub(r"\b\w+\d+\b"," ",x)  # Remove alphanumeric words
    return x.strip()  # Strip leading/trailing spaces

def tokenize_and_clean(x):
    tokens = word_tokenize(x)  # Tokenize the text
    cleaned_tokens = [remove_special_char_and_digit(token.lower()) for token in tokens]  # Clean each token
    cleaned_text = " ".join(cleaned_tokens)  # Join cleaned tokens into a single string
    return cleaned_text

def lemm(x):
    # Assuming lm (lemmatizer) is imported correctly
    x = [lm.lemmatize(word) for word in word_tokenize(x)]  # Lemmatize each word
    return " ".join(x)  # Join lemmatized words into a single string

def repeated(x):
    words = x.split()  # Split text into words
    unique_words = set(words)  # Get unique words
    clean_text = " ".join(unique_words)  # Join unique words into a single string
    return clean_text

def stop(x):
    # Assuming stopwords list is provided as a parameter
    return " ".join([word for word in word_tokenize(x) if word.lower() not in stopwords])  # Remove stopwords


In [13]:
def clean_data(x):
    x=strip(x)
    x=expand_contraction(x)
    x=remove_special_char_and_digit(x)
    x=tokenize_and_clean(x)
    x=lemm(x)
    x=repeated(x)
    x=stop(x)
    return x

In [14]:
data["Review Title Clean"]=data['Review Title'].apply(lambda y: clean_data(y))

In [15]:
stopwords.update({"wa","b","c","ha","d","e","f","j","k","l","m","n","o","p","q","r","s","t","u",
                 "v","x","y","z","g","h","i","ehhh"})

In [16]:
data["Review Title Clean"]=data["Review Title Clean"].apply(stop)

In [17]:
data.drop(columns=["Review Title"],axis=1,inplace=True)

In [18]:
from nltk.sentiment import SentimentIntensityAnalyzer

In [19]:
se=SentimentIntensityAnalyzer()
data['sentiment_score']=data["Review Title Clean"].apply(lambda x: se.polarity_scores(x)["compound"])

In [20]:
def sentiment(x):
    if x>0.1:
        return "positive"
    elif x<0:
        return "negative"
    else:
        return "neutral"

In [21]:
data['sentiment_cat']=data["sentiment_score"].apply(sentiment)

In [22]:
data1=data[["format","overall","Review Title Clean","sentiment_cat"]]
data1.columns=["format","rating","review_text","final_review"]

In [23]:
data1.to_csv("clean_data_10000_record.csv")

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from scipy.sparse import hstack,csr_matrix
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

In [25]:
data1["format"].unique()

array(['Amazon Video', 'DVD', 'Blu-ray', 'VHS Tape', 'HD DVD',
       'MP3 Music', 'Audio CD'], dtype=object)

In [26]:
from sklearn.preprocessing import LabelEncoder

In [27]:
le=LabelEncoder()
data1["format"]=le.fit_transform(data1['format'])

In [30]:
from joblib import dump
import pickle

In [31]:
with open('label_encoder.pkl', 'wb') as f:
    pickle.dump(le, f)

In [42]:
data1["format"].unique()

array([0, 3, 2, 6, 4, 5, 1])

In [44]:
vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=5, max_features=1000)
x_text = vectorizer.fit_transform(data1['review_text']).toarray()

# Categorical encoding

X = np.concatenate([x_text, data1[["format"]].values, data1[['rating']].values], axis=1)

array([[0.0, 0.0, 0.0, ..., 0.0, 0, '4'],
       [0.0, 0.324064942040335, 0.0, ..., 0.0, 3, '5'],
       [0.0, 0.0, 0.0, ..., 0.0, 0, '3'],
       ...,
       [0.0, 0.0, 0.0, ..., 0.0, 0, '5'],
       [0.0, 0.0, 0.0, ..., 0.0, 3, '5'],
       [0.0, 0.0, 0.0, ..., 0.0, 0, '4']], dtype=object)

In [46]:
def map_(x):
    if x=="positive":
        return 1
    elif x=="negative":
        return 2
    else:
        return 0

In [47]:
data1["final_review"]=data1["final_review"].apply(map_)
Y=data1['final_review']

In [48]:
Y.value_counts()

final_review
1    7329
2    1217
0     930
Name: count, dtype: int64

In [49]:
x_train,x_test,y_train,y_test=train_test_split(X,Y,test_size=0.2,random_state=123)
x_train.shape,x_test.shape,y_train.shape,y_test.shape

((7580, 1002), (1896, 1002), (7580,), (1896,))

In [50]:
xg=xgb.XGBClassifier(random_state=123)

In [51]:
xg.fit(x_train,y_train)

In [52]:
predict=xg.predict(x_test)
acc=accuracy_score(y_test,predict)
acc

0.8612869198312236

In [65]:
import pickle
import joblib

In [66]:
from joblib import dump

In [70]:
dump(xg, 'model1.joblib')

['model1.joblib']

In [71]:
dump(vectorizer, 'vectozier.joblib')

['vectozier.joblib']

In [62]:
with open('xgboost_model.pkl', 'wb') as f:
    pickle.dump(xg, f)

In [64]:
with open('TDIDF_VECTOZIER.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

In [72]:
from joblib import dump, load

def save_model_and_vectorizer(model, vectorizer):
    dump(model, 'model1.joblib')
    dump(vectorizer, 'vectozier.joblib')

# Example usage
# save_model_and_vectorizer(model, vectorizer)


In [75]:
def save_text_data(text_data, file_path):
    # Open the file in write mode with UTF-8 encoding
    with open(file_path, 'w', encoding='utf-8') as file:
        # Write the text data to the file
        file.write(text_data)

# Example usage
text_data = "model1.joblib"  # The text you want to save (could be a file name or description)
save_text_data(text_data, 'metadata.txt')  # Save the text data to 'metadata.txt'


In [77]:
# Save the joblib file path to a text file
def save_metadata(file_path, metadata_path):
    with open(metadata_path, 'w', encoding='utf-8') as file:
        file.write(file_path)

# Example usage
joblib_file_path = 'model1.joblib'  # Path to the .joblib file
metadata_file_path = 'metadata.txt'  # Path to save the metadata
save_metadata(joblib_file_path, metadata_file_path)


In [78]:
import joblib

# Read the file path from the metadata file and load the model
def load_model(metadata_path):
    with open(metadata_path, 'r', encoding='utf-8') as file:
        joblib_file_path = file.read().strip()  # Read and strip any extra whitespace/newlines
    model = joblib.load(joblib_file_path)
    return model

# Example usage
metadata_file_path = 'metadata.txt'  # Path to the metadata file
model = load_model(metadata_file_path)

# Use the model
print(model)


XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, objective='multi:softprob', ...)


In [79]:
import joblib

# Save the joblib file path to a text file
def save_metadata(file_path, metadata_path):
    with open(metadata_path, 'w', encoding='utf-8') as file:
        file.write(file_path)

# Example usage
vectorizer_file_path = 'vectozier.joblib'  # Path to the vectorizer .joblib file
metadata_file_path = 'vectorizer_metadata.txt'  # Path to save the metadata
save_metadata(vectorizer_file_path, metadata_file_path)


In [80]:
import joblib

# Read the file path from the metadata file and load the vectorizer
def load_vectorizer(metadata_path):
    with open(metadata_path, 'r', encoding='utf-8') as file:
        vectorizer_file_path = file.read().strip()  # Read and strip any extra whitespace/newlines
    vectorizer = joblib.load(vectorizer_file_path)
    return vectorizer

# Example usage
metadata_file_path = 'vectorizer_metadata.txt'  # Path to the metadata file
vectorizer = load_vectorizer(metadata_file_path)

# Use the vectorizer
print(vectorizer)


TfidfVectorizer(max_features=1000, min_df=5, ngram_range=(1, 2))


In [82]:
import streamlit as st
import pickle
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.feature_extraction.text import TfidfVectorizer



st.title('Sentiment Analysis Prediction')

text = st.text_area('Enter your text here:')
format_option = st.selectbox('Select format:', ['Amazon Video', 'DVD', 'Blu-ray', 'VHS Tape', 'HD DVD', 'MP3 Music', 'Audio CD'])
rating = st.slider('Select rating:', 1, 5)

# Fit LabelEncoder on all possible formats
all_possible_formats = ['Amazon Video', 'DVD', 'Blu-ray', 'VHS Tape', 'HD DVD', 'MP3 Music', 'Audio CD']
le = LabelEncoder()
le.fit(all_possible_formats)

def preprocess_text(text):
    # Read text with ANSI encoding and convert it to UTF-8
    return text.encode('latin1').decode('utf-8')

def predict_sentiment(text, format_option, rating, model, vectorizer):
    try:
        # Preprocess the text
        text = preprocess_text(text)
        
        # Transform the text data
        text_features = vectorizer.transform([text]).toarray()
        
        # Encode the categorical feature using LabelEncoder
        format_encoded = le.transform([format_option]).reshape(1, -1)
        
        # Combine the features
        features = np.concatenate([text_features, format_encoded, np.array([[rating]], dtype=float)], axis=1)
        
        # Predict the sentiment
        prediction = model.predict(features)
        predicted_label = prediction[0]
        
        return predicted_label
    except UnicodeEncodeError as e:
        st.error(f"Encoding error: {e}")
    except Exception as e:
        st.error(f"An unexpected error occurred: {e}")

if st.button("Predict Sentiment"):
    if text:
        try:
            sentiment = predict_sentiment(text, format_option, rating, model, vectorizer)
            st.write(f"Predicted Sentiment: {sentiment}")
        except UnicodeEncodeError as e:
            st.error(f"Encoding error: {e}")
        except Exception as e:
            st.error(f"An unexpected error occurred: {e}")
    else:
        st.write("Please enter text for prediction.")

2024-07-31 00:35:50.081 
  command:

    streamlit run C:\Users\aazar\anaconda3\Lib\site-packages\ipykernel_launcher.py [ARGUMENTS]
2024-07-31 00:35:50.081 Session state does not function when running a script without `streamlit run`


In [87]:
print(f"Number of features used for training: {X.shape[1]}")

Number of features used for training: 1002


In [88]:
print(f"Number of features in input data: {text_features.shape[1]}")


NameError: name 'text_features' is not defined