In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the customer table from a CSV file (relative path, so it is resolved
# against the kernel's working directory) and preview the first rows.
df = pd.read_csv("Mall_Customers.csv")
df.head()

In [None]:
# (row count, column count) of the loaded frame.
df.shape

In [None]:
# Column labels; the exact spellings are needed when selecting features later.
df.columns

In [None]:
# Per-column dtype and non-null count summary.
df.info()

In [None]:
# Descriptive statistics (count, mean, std, quartiles, min/max) per column.
df.describe()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Encode Gender as an integer flag (Male -> 0, Female -> 1).
#
# The mapping is guarded so that re-running this cell is a no-op: calling
# .map() a second time on the already-encoded column would replace every value
# with NaN, because 0 and 1 are not keys of the mapping. The encoding is only
# applied while at least one of the original string labels is still present.
gender_codes = {'Male': 0, 'Female': 1}
if df['Gender'].isin(list(gender_codes)).any():
    df['Gender'] = df['Gender'].map(gender_codes)

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the three numeric customer attributes; the fitted scaler is kept
# in `scaler` and the transformed array in `scaled_data` for later cells.
scaling_columns = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df[scaling_columns])

In [None]:
# Age distribution with a kernel-density overlay, drawn on an explicit axes.
fig, ax = plt.subplots()
sns.histplot(data=df, x='Age', kde=True, ax=ax)
plt.show()

In [None]:
# Horizontal box plot of annual income, drawn on an explicit axes.
fig, ax = plt.subplots()
sns.boxplot(data=df, x='Annual Income (k$)', ax=ax)
plt.show()

In [None]:
# Annual income against spending score, one point per customer.
fig, ax = plt.subplots()
sns.scatterplot(
    data=df,
    x='Annual Income (k$)',
    y='Spending Score (1-100)',
    ax=ax,
)
plt.show()

In [None]:
# Pairwise correlation heatmap of the numeric columns.
#
# numeric_only=True keeps this cell working even when a non-numeric column is
# present (for example if the Gender-encoding cell was skipped): since
# pandas 2.0, DataFrame.corr() raises on string columns instead of silently
# dropping them.
fig, ax = plt.subplots()
sns.heatmap(df.corr(numeric_only=True), annot=True, ax=ax)
ax.set_title("Correlation matrix")
plt.show()

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Regress the spending score on age and annual income.
X = df.loc[:, ['Age', 'Annual Income (k$)']]
y = df.loc[:, 'Spending Score (1-100)']

# Hold out 20% of the rows for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# fit() returns the estimator itself, so construction and fitting are chained.
lr = LinearRegression().fit(X_train, y_train)
y_pred = lr.predict(X_test)

# Mean squared error on the held-out rows (displayed as the cell output).
mean_squared_error(y_true=y_test, y_pred=y_pred)

In [None]:
# R^2 of the linear regression on the held-out rows.
# NOTE: reads y_test / y_pred from the linear-regression cell; the
# logistic-regression cell further down reassigns both names, so this cell is
# only meaningful when the notebook is run top to bottom.
r2_score(y_test, y_pred)

In [None]:
# Binary target: 1 when the spending score is strictly above the threshold,
# otherwise 0. A vectorised comparison replaces the row-by-row apply(lambda);
# the resulting 0/1 integer column is the same.
HIGH_SPENDER_THRESHOLD = 50
df['High_Spender'] = (
    df['Spending Score (1-100)'] > HIGH_SPENDER_THRESHOLD
).astype('int64')

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    precision_score,
    recall_score,
)

# Classify customers as high spenders from age and annual income.
# NOTE: X, y, the split variables and y_pred are reassigned here, replacing the
# values left by the linear-regression cell.
X = df.loc[:, ['Age', 'Annual Income (k$)']]
y = df.loc[:, 'High_Spender']

# Same 80/20 split parameters and seed as the regression cell.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# fit() returns the estimator itself, so construction and fitting are chained.
log_reg = LogisticRegression().fit(X_train, y_train)
y_pred = log_reg.predict(X_test)

# Confusion matrix on the held-out rows (displayed as the cell output).
confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# Share of held-out rows whose High_Spender label was predicted correctly.
# Reads y_test / y_pred as left by the logistic-regression cell.
accuracy_score(y_test, y_pred)

In [None]:
# Precision for the positive class (High_Spender == 1) on the held-out rows.
precision_score(y_test, y_pred)

In [None]:
# Recall for the positive class (High_Spender == 1) on the held-out rows.
recall_score(y_test, y_pred)

In [None]:
from sklearn.cluster import KMeans

# Segment customers into five clusters using the standardised features in
# `scaled_data` (computed in the scaling cell) and store the label per row.
#
# n_init is pinned explicitly: its default changed from 10 to 'auto' in
# scikit-learn 1.4 (with a FutureWarning on 1.2/1.3), so leaving it unset makes
# the cluster assignment depend on the installed version.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
df['Cluster'] = kmeans.fit_predict(scaled_data)

In [None]:
# Show the K-Means segments in the income / spending-score plane.
# The clusters were fitted on three standardised features (age included), so
# this is a 2-D projection and segments may overlap here.
#
# Colour alone does not say which cluster a point belongs to, so a legend is
# built from the scatter's own colour mapping, and a title is added so the
# figure can be read on its own.
fig, ax = plt.subplots()
points = ax.scatter(
    df['Annual Income (k$)'],
    df['Spending Score (1-100)'],
    c=df['Cluster'],
)
ax.set_xlabel("Annual Income")
ax.set_ylabel("Spending Score")
ax.set_title("Customer segments (K-Means)")
ax.legend(*points.legend_elements(), title="Cluster")
plt.show()