In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file, one per line.
input_files = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file in input_files:
    print(input_file)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv


## Importing dependencies

In [3]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split                # Split data into train and test data
from tensorflow.keras.models import Sequential                      # build sequential layer of neural network
from tensorflow.keras.layers import Dense, Embedding, LSTM          # 
from tensorflow.keras.preprocessing.text import Tokenizer           # to map words to intergers or vectors
from tensorflow.keras.preprocessing.sequence import pad_sequences   # to ensure all data is in same shape
from tensorflow.keras.layers import Input

KeyboardInterrupt: 

In [None]:
# Load the IMDB reviews CSV; later cells use its "review" and "sentiment" columns.
data = pd.read_csv(
    '/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv'
)

In [None]:
# Preview the first rows of the loaded reviews to check the columns and their contents.
data.head()

In [None]:
# Class balance check: number of reviews per sentiment label.
sentiment_column = data['sentiment']
sentiment_column.value_counts()

In [None]:
# Visualise the positive / negative class balance as a bar chart.

import seaborn as sns
sns.countplot(data=data, x='sentiment')

In [None]:
# Encode the sentiment labels as integers: positive -> 1, negative -> 0.
# An explicit map + cast replaces DataFrame.replace(..., inplace=True): replace() relies on
# implicit object -> int downcasting, which pandas 2.2 deprecates, so the column could
# end up object-typed and break model training later.
# Values that are already 0/1 pass through unchanged, so re-running this cell is safe;
# any unexpected label (or missing value) makes the integer cast fail loudly.
sentiment_to_label = {'positive': 1, 'negative': 0}
data['sentiment'] = (
    data['sentiment']
    .map(lambda value: sentiment_to_label.get(value, value))
    .astype('int64')
)

## Split data into train and test data

In [None]:
# Hold out 20% of the rows as test data; the fixed seed keeps the split reproducible.
train_data, test_data = train_test_split(
    data,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Report (rows, columns) for the training split, then for the test split.
for split_frame in (train_data, test_data):
    print(split_frame.shape)

## Data Processing

In [None]:
# Text -> integer sequences.
# The tokenizer is fitted on the training reviews only, with the vocabulary capped
# through num_words=5000, so the test reviews are encoded with the training vocabulary.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_data["review"])

train_sequences = tokenizer.texts_to_sequences(train_data["review"])
test_sequences = tokenizer.texts_to_sequences(test_data["review"])

# pad_sequences brings every review to the same length (200) so the model gets a fixed input shape.
X_train = pad_sequences(train_sequences, maxlen=200)
X_test = pad_sequences(test_sequences, maxlen=200)

In [None]:
# Inspect the padded training matrix (one row of token ids per review).
print(X_train)

In [None]:
# Inspect the padded test matrix (one row of token ids per review).
print(X_test)

In [None]:
# Targets: the sentiment column of each split.
Y_train, Y_test = train_data["sentiment"], test_data["sentiment"]

In [None]:
# Both label series should report an integer dtype (train first, then test).
for labels in (Y_train, Y_test):
    print(labels.dtype)

In [None]:
# Inspect the training labels.
print(Y_train)

# Building LSTM Model

In [None]:
# LSTM sentiment classifier, assembled from a single layer list:
#   Input     - sequences of 200 token ids (the maxlen used with pad_sequences)
#   Embedding - vocabulary of 5000 (the Tokenizer's num_words), each id mapped to a 128-dimensional vector
#   LSTM      - 128 units, with 20% dropout on the inputs and on the recurrent connections to curb overfitting
#   Dense     - one sigmoid unit, since this is a binary (positive / negative) classification
model = Sequential([
    Input(shape=(200,)),
    Embedding(input_dim=5000, output_dim=128),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation="sigmoid"),
])

In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy as the reported metric.
model.compile(
    loss='binary_crossentropy',
    metrics=['accuracy'],
    optimizer='adam',
)


In [None]:
# Print the layer-by-layer overview of the model.
model.summary()

In [None]:
# Compile with Adam, binary cross-entropy and accuracy tracking.
# NOTE(review): this repeats the compile call made earlier in the notebook with identical settings.
model.compile(
    loss="binary_crossentropy",
    metrics=["accuracy"],
    optimizer="adam",
)

# Training model

In [None]:
# Train for 5 epochs in mini-batches of 32.
# The test split is passed as validation data, so the per-epoch val_* metrics are
# computed on the same data that model.evaluate uses afterwards.
history = model.fit(
    X_train,
    Y_train,
    batch_size=32,
    epochs=5,
    validation_data=(X_test, Y_test),
)

# Model Evaluation

In [None]:
# Score the trained model on the held-out test split and report both numbers.
evaluation = model.evaluate(X_test, Y_test)
loss, accuracy = evaluation
print(f"Test Loss: {loss}", f"Test Accuracy: {accuracy}", sep="\n")

## Building a Predictive System

In [None]:
def predict_sentiment(review):
    """Classify a single review string as "positive" or "negative".

    The text is encoded with the fitted ``tokenizer``, padded to length 200
    like the training data, and scored by ``model``. A score above 0.5 is
    reported as "positive"; anything else as "negative".

    Args:
        review: Raw review text.

    Returns:
        The string "positive" or "negative".
    """
    encoded = tokenizer.texts_to_sequences([review])
    model_input = pad_sequences(encoded, maxlen=200)
    scores = model.predict(model_input)
    # predict returns one row per input; take the single score of the single review.
    if scores[0][0] > 0.5:
        return "positive"
    return "negative"

In [None]:
# Example usage: a plainly positive review.
new_review = "This movie was fantastic. I loved it."
sentiment = predict_sentiment(new_review)
print(f"The sentiment of the review is: {sentiment}")

In [None]:
# Example usage: a negated phrase ("not that good").
new_review = "This movie was not that good"
sentiment = predict_sentiment(new_review)
print(f"The sentiment of the review is: {sentiment}")

In [None]:
# Example usage: a mixed review ("ok but not that good").
new_review = "This movie was ok but not that good."
sentiment = predict_sentiment(new_review)
print(f"The sentiment of the review is: {sentiment}")

In [None]:
pip install xgboost

In [None]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Baseline: gradient-boosted trees fitted on the same padded token-id matrix as the LSTM.
# NOTE(review): the target is a 0/1 label, so xgb.XGBClassifier scored with accuracy would be
# directly comparable to the LSTM; the regressor is kept so the reported metrics
# (R², MAE, MSE) stay the same.
xgb_model = xgb.XGBRegressor(random_state=42)

# Train the XGBoost model
xgb_model.fit(X_train, Y_train)

# The notebook only creates a train/test split (X_val / y_val are never defined, so the
# original cell raised NameError). The held-out test split is used as the validation set,
# as in the LSTM's model.fit call.
y_pred_val = xgb_model.predict(X_test)

# Evaluate the model's performance on the held-out split
xgb_r2 = r2_score(Y_test, y_pred_val)
xgb_mae = mean_absolute_error(Y_test, y_pred_val)
xgb_mse = mean_squared_error(Y_test, y_pred_val)

print(f"XGBoost Validation R² score: {xgb_r2}")
print(f"XGBoost Mean Absolute Error: {xgb_mae}")
print(f"XGBoost Mean Squared Error: {xgb_mse}")