### 1. Collecting the data and loading the data

In [4]:
import pandas as pd

# Loading the dataset
df = pd.read_csv('/IMDB Dataset.csv')

# Display the first few rows and last few rows and check the columns
print(df.head())
print(df.tail())
print(df.columns)



                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive
                                                  review sentiment
49995  I thought this movie did a down right good job...  positive
49996  Bad plot, bad dialogue, bad acting, idiotic di...  negative
49997  I am a Catholic taught in parochial elementary...  negative
49998  I'm going to have to disagree with the previou...  negative
49999  No one expects the Star Trek movies to be high...  negative
Index(['review', 'sentiment'], dtype='object')


In [None]:
from google.colab import drive
drive.mount('/content/drive')

## Data analytics on IMDB MOVIE REVIEWS

### 2. Understanding the dataset

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# Vectorizing the text data
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['review'])

# Converting sentiment labels to binary labels (0 for negative, 1 for positive)
y = df['sentiment'].map({'negative': 0, 'positive': 1})

# Spliting data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)


### 3. IMPLEMENTATION OF LOGISTIC REGRESSION

In [6]:
from sklearn.linear_model import LogisticRegression

# Initializing Logistic Regression model
lr_model = LogisticRegression(max_iter=1000)

# Training the model
lr_model.fit(X_train, y_train)



### 5. Creating model and checking accuracy score

In [7]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Predict on test data
y_pred = lr_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.89


### 4. PROPER VISUALISATION

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import seaborn as sns

# Example data
data = pd.read_csv('/content/IMDB Dataset.csv')
df = pd.DataFrame(data)

# Splitting data
X = df['review']
y = df['sentiment']

# Vectorizing the text data
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Logistic Regression model
lr = LogisticRegression()
lr.fit(X_train, y_train)

# Visualization
plt.figure(figsize=(10, 6))

plt.subplot(2, 2, 1)
plt.hist(X_train.sum(axis=1).A1, bins=30, color='blue', alpha=0.7)
plt.title('Distribution of Non-zero Counts in X_train')
plt.xlabel('Number of Non-zero Counts')
plt.ylabel('Frequency')

plt.subplot(2, 2, 2)
plt.hist(X_test.sum(axis=1).A1, bins=30, color='green', alpha=0.7)
plt.title('Distribution of Non-zero Counts in X_test')
plt.xlabel('Number of Non-zero Counts')
plt.ylabel('Frequency')

plt.subplot(2, 2, 3)
sns.countplot(x=y_train, palette='pastel', hue=None, legend=False)  # Using seaborn for countplot
plt.title('Distribution of y_train')
plt.xlabel('Sentiment')
plt.ylabel('Count')

plt.subplot(2, 2, 4)
sns.countplot(x=y_test, palette='pastel', hue=None, legend=False)  # Using seaborn for countplot
plt.title('Distribution of y_test')
plt.xlabel('Sentiment')
plt.ylabel('Count')

plt.tight_layout()
plt.show()


FileNotFoundError: [Errno 2] No such file or directory: '/content/IMDB Dataset.csv'