In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

Part 1: Data Preprocessing

1. Load the Dataset

In [None]:
# Read the Netflix user-base CSV from the working directory into the frame
# that the rest of the notebook refers to as `data`.
data = pd.read_csv(filepath_or_buffer="Netflix_Userbase.csv")

In [None]:
# Preview the first rows of the loaded frame. The CSV is bound to `data` by the
# loading cell, so that is the name inspected here.
print(data.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() only appended a stray "None".
data.info()

2. Handling Missing Values

In [None]:
# Report how many missing entries each column of the raw frame contains.
missing_per_column = data.isna().sum()
print(missing_per_column)

3. Encode Categorical Variables

In [None]:
# One-hot encode the categorical columns of the raw frame. drop_first=True
# drops one level per encoded column so the dummy columns are not perfectly
# collinear in the linear models fitted later.
#
# 'Monthly Revenue' is deliberately left as the original numeric value: it is
# the regression target in Part 3 (scored with RMSE / R^2) and Part 4 derives
# its binary label by comparing it with its own mean. Overwriting it here with
# a 0/1 flag made the regression fit a binary column and turned the Part 4
# label into a copy of that flag.
data_dummies = pd.get_dummies(data, drop_first=True)

4. Feature Selection

In [None]:
# Target: the 'Monthly Revenue' column of the encoded frame.
y = data_dummies['Monthly Revenue']
# Features: every encoded column except the target itself and 'Age'.
X = data_dummies.drop(columns=['Monthly Revenue', 'Age'])

Part 2: Exploratory Data Analysis (EDA)

1. Descriptive Statistics


In [None]:
# Summary statistics for the feature matrix, as computed by DataFrame.describe().
feature_summary = X.describe()
print(feature_summary)

2. Visualizations

In [None]:
def plot_distribution(values, label, bins=20):
    """Draw a histogram with a KDE overlay for one column.

    Parameters
    ----------
    values : pandas.Series
        Observations to plot.
    label : str
        Column name shown in the title and on the x-axis.
    bins : int, default 20
        Number of histogram bins.
    """
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.histplot(values, bins=bins, kde=True, ax=ax)
    ax.set_title(f'Distribution of {label}')
    ax.set_xlabel(label)
    ax.set_ylabel('Count')
    plt.show()


# Both plots read from the raw frame `data`, so each shows the column's
# original values rather than anything derived during encoding.
for column in ('Monthly Revenue', 'Age'):
    plot_distribution(data[column], column)

Part 3: Linear Regression Model (Predicting Monthly Revenue)



1. Build the Model

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the linear regression on the training portion only.
lr_model = LinearRegression()
lr_model.fit(X=X_train, y=y_train)

In [None]:
# Predict the target for the held-out rows.
y_pred = lr_model.predict(X=X_test)

2. Model Evaluation

In [None]:
# Score the held-out predictions: RMSE is the square root of the mean squared
# error, and R^2 comes from r2_score.
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print("Linear Regression Model Evaluation:")
for metric_label, metric_value in (
    ("Root Mean Squared Error (RMSE):", rmse),
    ("R-squared (R2):", r2),
):
    print(metric_label, metric_value)

Part 4: Logistic Regression Model (Predicting Customer Feedback)

1. Model Building

In [None]:
# Derive the binary 'Feedback' label: 1 where a row's 'Monthly Revenue' is
# above the column mean, 0 otherwise.
# NOTE(review): this label is synthesised from revenue, not read from a
# feedback column in the data - confirm that is the intended target.
revenue = data_dummies['Monthly Revenue']
data_dummies['Feedback'] = revenue.gt(revenue.mean()).astype(int)

# Features exclude both the label and the revenue column it was derived from.
y_logistic = data_dummies['Feedback']
X_logistic = data_dummies.drop(columns=['Monthly Revenue', 'Feedback'])

In [None]:
# Same 80/20 split and seed as the regression section, applied to the
# classification features and label.
(
    X_train_logistic,
    X_test_logistic,
    y_train_logistic,
    y_test_logistic,
) = train_test_split(
    X_logistic,
    y_logistic,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Logistic regression with scikit-learn's default settings, fitted on the
# training portion only.
logistic_model = LogisticRegression()
logistic_model.fit(X=X_train_logistic, y=y_train_logistic)

In [None]:
# Predicted class labels for the held-out rows.
y_pred_logistic = logistic_model.predict(X=X_test_logistic)

2. Model Evaluation

In [None]:
# Compare held-out labels with the model's predictions.
conf_matrix = confusion_matrix(y_test_logistic, y_pred_logistic)
f1 = f1_score(y_test_logistic, y_pred_logistic)
recall = recall_score(y_test_logistic, y_pred_logistic)
precision = precision_score(y_test_logistic, y_pred_logistic)
accuracy = accuracy_score(y_test_logistic, y_pred_logistic)

print("Logistic Regression Model Evaluation:")
for metric_label, metric_value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
):
    print(metric_label, metric_value)
# The matrix is printed on its own line below its heading.
print("Confusion Matrix:", conf_matrix, sep="\n")

Part 5: Comparative Analysis and Visualization

In [None]:
# Run the feature matrix through get_dummies once more.
# NOTE(review): X was sliced from an already-encoded frame, so this is
# presumably a no-op copy - confirm whether the extra pass is needed.
X_encoded = pd.get_dummies(data=X)


In [None]:
# Refit the linear model on every row (not just the training split) before
# reading its coefficients. This replaces the fit produced in Part 3.
lr_model.fit(X=X_encoded, y=y)

In [None]:
# Pair each feature column with the coefficient the refitted linear model
# assigned to it.
lr_coef = lr_model.coef_
feature_importance_lr = pd.DataFrame(
    {
        'Feature': X_encoded.columns,
        'Coefficient': lr_coef,
    }
)
print("Linear Regression Feature Importance:", feature_importance_lr, sep="\n")

In [None]:
# Columns handled by the preprocessing step, grouped by how they are
# transformed. StandardScaler, OneHotEncoder and ColumnTransformer are
# imported in the notebook's import cell.
numerical_features = ['Age']
categorical_features = ['Gender']

# Numeric columns are standardised; categorical columns are one-hot encoded.
# handle_unknown='ignore' makes a category that was not seen during fit encode
# as all zeros instead of raising at transform time.
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Route each column group to its transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

In [None]:
# Chain the column preprocessing with a logistic regression so both steps are
# fitted and applied together. Pipeline is imported in the notebook's import
# cell.
# NOTE(review): no cell in this notebook calls fit() on this pipeline, so its
# logistic-regression step has no coefficients yet - fit it before reading any.
log_reg_pipeline = Pipeline(
    steps=[
        ('preprocessing', preprocessor),
        ('logistic_regression', LogisticRegression()),
    ]
)

In [None]:
# Pull the classifier step out of the pipeline by its step name.
log_reg_model = log_reg_pipeline['logistic_regression']

In [None]:
def plot_coefficients(importance, title):
    """Draw a horizontal bar chart of model coefficients, one bar per feature.

    Parameters
    ----------
    importance : pandas.DataFrame
        Table with a 'Feature' column and a 'Coefficient' column.
    title : str
        Title shown above the chart.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(x='Coefficient', y='Feature', data=importance, ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Coefficient')
    ax.set_ylabel('Feature')
    plt.show()


# Coefficients of the logistic model fitted in Part 4. For a binary target
# scikit-learn stores coef_ with shape (1, n_features), so row 0 holds one
# weight per column of X_logistic (the frame the model was trained from).
feature_importance_logistic = pd.DataFrame(
    {
        'Feature': X_logistic.columns,
        'Coefficient': logistic_model.coef_[0],
    }
)

# Each chart shows its own model's coefficients; the logistic chart previously
# re-plotted the linear-regression table.
plot_coefficients(feature_importance_lr, 'Linear Regression Feature Importance')
plot_coefficients(feature_importance_logistic, 'Logistic Regression Feature Importance')