<a href="https://colab.research.google.com/github/A-Burnhard/Mall-Customer-Segmentation/blob/main/Mall_Customer_Segmentation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Load the Iris dataset into the notebook.
# Column names are passed via `names=`, so pandas treats every line of the file as data.
cols = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'class']
iris_data = pd.read_csv("iris.data", names=cols)
iris_data.head()

**Preprocessing**


**Identifying Missing values**

In [None]:
# Tally the null entries found in each column of the loaded frame
missing_values = iris_data.isna().sum()
print("Missing values:\n", missing_values)

**Identifying Duplicates**

In [None]:
# Flag rows that exactly repeat an earlier row (the first occurrence is not flagged).
duplicates = iris_data.duplicated()

# Report the count and the offending rows rather than the full per-row boolean mask,
# which is unreadable at the size of the dataset.
print(f"Number of duplicate rows: {duplicates.sum()}")
print("Duplicate instances:\n", iris_data[duplicates])

**Outlier Detection using Boxplot**

In [None]:

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore

# Outlier analysis only makes sense for the numeric measurements, so the
# string-valued "class" column is left out. `drop` returns a new frame, which
# keeps `iris_data` itself untouched by anything done below.
numeric_features = iris_data.drop("class", axis=1)

# Visualize the distribution of each feature using box plots
plt.figure(figsize=(10, 6))
sns.boxplot(data=numeric_features)
plt.xticks(rotation=90)
plt.title('Boxplot of Features')
plt.show()

# --- Z-score method ---------------------------------------------------------
# A row is flagged when any of its features lies more than `outlier_threshold`
# standard deviations from that feature's mean.
z_scores = zscore(numeric_features)
outlier_threshold = 3  # Adjust the threshold as per your preference
outliers_zscore = (abs(z_scores) > outlier_threshold).any(axis=1)

# --- IQR method -------------------------------------------------------------
# A row is flagged when any of its features falls outside the Tukey fences
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] computed per feature.
Q1 = numeric_features.quantile(0.25)
Q3 = numeric_features.quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
outliers_iqr = ((numeric_features < lower_fence) | (numeric_features > upper_fence)).any(axis=1)

# The two masks are kept under separate names so that neither result silently
# overwrites the other; the IQR mask is the one used for handling below.
outliers = outliers_iqr

# Count the number of outliers found by each method
num_outliers = outliers.sum()
print(f"Number of outliers (z-score, |z| > {outlier_threshold}): {outliers_zscore.sum()}")
print(f"Number of outliers (IQR fences): {num_outliers}")

# Decide whether to remove outliers or transform them
remove_outliers = False

if remove_outliers:
    # Remove flagged rows from the dataset
    data = numeric_features[~outliers]
    print("Outliers removed.")
else:
    # Pull each out-of-range value back to the nearest fence of its own column.
    # Only the offending cells change; overwriting whole rows with one constant
    # would destroy the valid measurements in those rows and could itself lie
    # outside the observed range of the features.
    data = numeric_features.clip(lower=lower_fence, upper=upper_fence, axis=1)
    print("Outliers transformed.")

# Updated visualization after handling outliers
plt.figure(figsize=(10, 6))
sns.boxplot(data=data)
plt.xticks(rotation=90)
plt.title('Boxplot of Features (After Outlier Handling)')
plt.show()

**Defining X and Y values**

In [None]:
# Re-read the raw file into `iris_data`
iris_data = pd.read_csv("iris.data", names=cols)

# Integer code assigned to each species name
label_mapping = {'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2}

# Features: every column except the species label.
# Target: the species label translated to its integer code.
X = iris_data.drop(columns='class')
y = iris_data['class'].map(label_mapping)

**Influential data point detection using leverage and Cook's distance**

In [None]:
import pandas as pd
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import OLSInfluence

# Build the regression design matrix (features + intercept column) under its
# own name. Rebinding `X` here would leave an all-ones "const" column in the
# feature matrix that every later cell reads.
X_design = sm.add_constant(X)

# Fit the linear regression model
model = sm.OLS(y, X_design)
results = model.fit()

# Get the influence statistics
influence = OLSInfluence(results)

# Leverage: diagonal of the hat matrix, one value per observation
leverage = influence.hat_matrix_diag

# Cook's distance: a (distance, p-value) pair; only the distances are used below
cooks_distance = influence.cooks_distance

# Rule-of-thumb cut-offs, with p = number of fitted parameters INCLUDING the
# intercept and n = number of observations:
#   leverage        > 2 * p / n
#   Cook's distance > 4 / (n - p)
# `X_design` already contains the intercept column, so its column count is p;
# adding another 1 would count the intercept twice.
n_obs, n_params = X_design.shape
influential_points_leverage = leverage > 2 * n_params / n_obs
influential_points_cooks = cooks_distance[0] > 4 / (n_obs - n_params)

# Print the influential data points
print("Influential points based on leverage:")
print(X[influential_points_leverage])

print("\nInfluential points based on Cook's distance:")
print(X[influential_points_cooks])


**Checking normality of the features using the Shapiro-Wilk test**

In [None]:
import pandas as pd
from scipy.stats import shapiro

# Select the features to check for normality. If `X` carries an intercept
# column named "const", leave it out: it is not a measurement, and a column
# with a single repeated value has no distribution to test.
features = X.drop(columns="const", errors="ignore")

# Significance level shared by every test
alpha = 0.05

# Perform Shapiro-Wilk test for each feature
for column in features.columns:
    stat, p_value = shapiro(features[column])

    print(f"Feature: {column}")
    print(f"Shapiro-Wilk test statistic: {stat}")
    print(f"P-value: {p_value}")

    # The null hypothesis is that the sample is normally distributed, so a
    # p-value above alpha means normality is not rejected.
    if p_value > alpha:
        print("Feature appears to be normally distributed.")
    else:
        print("Feature does not appear to be normally distributed.")

    print()


**Data transformation**

In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Scale the measurements only. If `X` carries an intercept column named
# "const", leave it out; scaling a constant column is meaningless.
feature_matrix = X.drop(columns="const", errors="ignore")

# Perform normalization using Min-Max scaler (each feature mapped to [0, 1])
minmax_scaler = MinMaxScaler()
X_normalized = minmax_scaler.fit_transform(feature_matrix)

# Perform standardization using StandardScaler (zero mean, unit variance per feature)
scaler = StandardScaler()
X_standardized = scaler.fit_transform(feature_matrix)

# Show a preview of the transformed values. The scalers return plain arrays,
# so they are wrapped in a frame only to get labelled columns in the output.
print("Min-Max normalized features (first rows):")
print(pd.DataFrame(X_normalized, columns=feature_matrix.columns).head())
print("\nStandardized features (first rows):")
print(pd.DataFrame(X_standardized, columns=feature_matrix.columns).head())

**Feature Selection**

In [None]:
import pandas as pd
# RandomForestRegressor stays imported in case another cell still refers to it.
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

# Rank the measurements only. If `X` carries an intercept column named
# "const", leave it out so it is not ranked as if it were a feature.
feature_matrix = X.drop(columns="const", errors="ignore")

# The target is a class label (integer codes for three species), so a
# classifier is the matching estimator; a regressor would treat the codes as
# quantities on a numeric scale. The fixed seed makes the ranking reproducible.
rf = RandomForestClassifier(random_state=42)

# Fit the Random Forest model
rf.fit(feature_matrix, y)

# Get the feature importances
feature_importances = rf.feature_importances_

# Create a DataFrame of feature importances
feature_importances_df = pd.DataFrame({'Feature': feature_matrix.columns, 'Importance': feature_importances})

# Sort the features by importance (descending order)
feature_importances_df = feature_importances_df.sort_values(by='Importance', ascending=False)

# Print the feature importances
print(feature_importances_df)

**Oversampling techniques using the Synthetic Minority Over-sampling Technique (SMOTE) to balance the imbalanced dataset**

In [None]:
import pandas as pd
from imblearn.over_sampling import SMOTE

# Resample the measurements only. If `X` carries an intercept column named
# "const", leave it out so it is not interpolated as if it were a feature.
feature_matrix = X.drop(columns="const", errors="ignore")

# Show the starting class counts so the effect of resampling is visible
print("Class distribution before SMOTE:")
print(y.value_counts())

# Create a SMOTE object; the fixed seed makes any synthetic samples reproducible
smote = SMOTE(random_state=42)

# Perform oversampling
X_resampled, y_resampled = smote.fit_resample(feature_matrix, y)

# Print the balanced class distribution
print("Class distribution after SMOTE:")
print(y_resampled.value_counts())


**Selecting Appropriate Learners for Training and Validation (K-Means and Agglomerative Clustering)**

**K-Means clustering with K-fold cross-validation**

In [None]:
from sklearn.datasets import load_iris
from sklearn.cluster import KMeans
from sklearn.model_selection import KFold, cross_val_score

# Load the Iris dataset
iris = load_iris()
X = iris.data

# Instantiate the KMeans model. `n_init` and `random_state` are set explicitly
# so the centroid initialisation, and therefore the scores, are reproducible.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)

# The rows of this dataset are ordered by species. An integer `cv` with no
# target produces contiguous, unshuffled folds, so a held-out fold could
# consist of a single species. Shuffling gives every fold a mix of all three.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Perform K-fold cross-validation. With no `scoring` argument the estimator's
# own `score` method is used; for KMeans that is the negative of the K-means
# objective on the held-out fold, so values closer to zero are better.
k_fold_scores = cross_val_score(kmeans, X, cv=cv)

# Print the cross-validation scores
print("K-Fold Cross-Validation Scores for K-Means:")
print(k_fold_scores)



**Hierarchical clustering using agglomerative clustering**

In [None]:
from sklearn.datasets import load_iris
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
from sklearn.model_selection import LeaveOneOut

# Load the Iris dataset
iris = load_iris()
X = iris.data

# Instantiate the AgglomerativeClustering model
agg_clustering = AgglomerativeClustering(n_clusters=3)

# Leave-one-out resampling.
# `fit_predict` returns one label per TRAINING row, so the result has one
# entry fewer than `X`. It must not be indexed with the held-out row's
# position in `X`: that picks an unrelated training row's label and runs off
# the end of the array on the last fold. The held-out row is therefore not
# labelled here; instead each fold records how well separated the clusters
# are when that single row is left out (silhouette coefficient on the
# training rows). Stable scores across folds indicate that no single
# observation drives the clustering.
loo = LeaveOneOut()
validation_scores = []

for train_index, _ in loo.split(X):
    X_train = X[train_index]
    train_labels = agg_clustering.fit_predict(X_train)
    validation_scores.append(silhouette_score(X_train, train_labels))

# Summarise the per-fold scores rather than listing one value per observation
mean_score = sum(validation_scores) / len(validation_scores)
print("Leave-One-Out Cross-Validation Scores for Hierarchical Clustering:")
print(f"folds: {len(validation_scores)}")
print(f"silhouette mean: {mean_score:.4f}")
print(f"silhouette min:  {min(validation_scores):.4f}")
print(f"silhouette max:  {max(validation_scores):.4f}")
