<a href="https://colab.research.google.com/github/Aartizikre150/DAB-303/blob/main/DAB303_Group_Project1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -U scikit-learn
!pip install category_encoders
!pip install matplotlib seaborn

In [None]:
# import the libraries
import pandas as pd
import numpy as np

# Mount Google Drive at /content/drive so files stored there can be read by
# path (the dataset below is loaded from /content/drive/MyDrive/...).
# NOTE(review): `google.colab` is presumably only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Read the e-commerce churn CSV from the mounted Drive folder into a DataFrame
data = pd.read_csv(
    '/content/drive/MyDrive/DAB303/Project1/E-Commerce Churn Data.csv'
)

In [None]:
# Print the dimensions of the dataset as a (rows, columns) tuple
print(data.shape)

In [None]:
# Print pandas' concise summary of the DataFrame (DataFrame.info)
data.info()

In [None]:
# Show the first 5 records of the dataset as the cell output
data.head()

In [None]:
# Print descriptive statistics; include='all' asks pandas to describe every
# column rather than only the numeric ones
print(data.describe(include='all'))

In [None]:
# Count the missing (NA) entries in each column and print the per-column totals
null_counts = data.isna().sum()
print(null_counts)

In [None]:
# Harmonise category labels: for each column, map the listed spelling to the
# label it should be merged into. Values not listed are left untouched.
label_fixes = {
    'PreferredLoginDevice': {'Mobile Phone': 'Phone'},
    'PreferredPaymentMode': {'Cash on Delivery': 'COD', 'Credit Card': 'CC'},
    'PreferedOrderCat': {'Mobile Phone': 'Phone'},
}

for column, replacements in label_fixes.items():
    data[column] = data[column].replace(replacements)

In [None]:
# Categorical to numerical conversion
import category_encoders as ce

# Text-valued columns that are replaced by ordinal codes
categorical_columns = [
    'PreferredLoginDevice',
    'PreferredPaymentMode',
    'Gender',
    'PreferedOrderCat',
    'MaritalStatus',
]
encoder = ce.OrdinalEncoder(cols=categorical_columns)

# Fit the encoder on the DataFrame and transform it in one step, then preview
data_encoded = encoder.fit_transform(data)
data_encoded.head()

In [None]:
# Split the encoded data into two frames:
#   * missing_data_encoded - rows with a missing value in any checked column
#   * data_encoded         - the remaining rows (complete in the checked columns)

# Columns checked for missing values
columns_to_check = ['Tenure', 'WarehouseToHome', 'HourSpendOnApp', 'OrderAmountHikeFromlastYear', 'CouponUsed', 'OrderCount', 'DaySinceLastOrder']

# True for every row that has a missing value in at least one checked column
has_missing = data_encoded[columns_to_check].isna().any(axis=1)

# Rows with missing values, still carrying their original index
missing_rows = data_encoded[has_missing]

# Select both partitions directly with the mask. This keeps the column dtypes
# of data_encoded, so there is no need to concatenate onto an empty frame
# (which yields object dtypes) and cast back afterwards.
missing_data_encoded = missing_rows.reset_index(drop=True)
data_encoded = data_encoded[~has_missing].reset_index(drop=True)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer

# Model-based imputation: for each target column, train a random forest on the
# complete rows (data_encoded) and use it to fill the gaps in the incomplete
# rows (missing_data_encoded). Afterwards both frames are recombined.
target_variables = ['Tenure', 'WarehouseToHome', 'HourSpendOnApp', 'OrderAmountHikeFromlastYear', 'CouponUsed', 'OrderCount', 'DaySinceLastOrder']

# Columns never used as predictors: the customer identifier is just a key, and
# the churn label must not leak into features that are later used to predict it.
non_predictor_columns = ['CustomerID', 'Churn']

for target_variable in target_variables:
    # Only rows whose target is actually missing receive a prediction;
    # observed values in missing_data_encoded are left as they are.
    rows_to_fill = missing_data_encoded[target_variable].isna()
    if not rows_to_fill.any():
        continue  # nothing to impute for this column

    # Step 1: Build features and target from the complete rows
    excluded_columns = [target_variable] + non_predictor_columns
    X = data_encoded.drop(columns=excluded_columns, errors='ignore')  # Features
    y = data_encoded[target_variable]  # Target variable

    # Train only on rows with an observed target (never on a filled-in target)
    observed = y.notna()
    X, y = X[observed], y[observed]

    # Step 2: Mean-impute the features. The rows to fill can still have gaps
    # in other columns, and the forest is given NaN-free input.
    feature_imputer = SimpleImputer(strategy='mean')
    X_imputed = feature_imputer.fit_transform(X)

    # Step 3: Create and train a RandomForestRegressor model
    model = RandomForestRegressor(n_estimators=100, random_state=42)  # You can adjust the number of trees as needed
    model.fit(X_imputed, y)

    # Step 4: Predict the missing target values and write them back in place
    missing_data_features = missing_data_encoded.loc[rows_to_fill].drop(columns=excluded_columns, errors='ignore')
    missing_data_encoded.loc[rows_to_fill, target_variable] = model.predict(
        feature_imputer.transform(missing_data_features)
    )

# Drop rows of the complete set that still contain NaN values in any column
data_encoded = data_encoded.dropna()

# Concatenate the dataframes and restore a CustomerID-sorted, 0-based index
data_encoded = pd.concat([data_encoded, missing_data_encoded]).sort_values(by='CustomerID').reset_index(drop=True)


In [None]:

# Display the recombined data_encoded frame as the cell output
data_encoded

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Correlation heatmap of data_encoded. All columns are included; the
# categorical columns were converted to numeric codes by the encoder earlier.

# Pairwise correlation between the columns
correlation_matrix = data_encoded.corr()

# Draw the matrix as an annotated heatmap, values formatted to two decimals
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title('Correlation Heatmap of DataFrame Columns')
plt.show()


In [None]:
# The distribution of churn (Yes/No). This shows the churn rate in the dataset.

# Count the customers per churn value. Sorting by the value itself (instead of
# by frequency, value_counts' default) keeps the bar order and the labels
# independent of which class happens to be larger.
churn_counts = data_encoded['Churn'].value_counts().sort_index()

# Explicit value -> label mapping.
# Assumes Churn is coded 0 = not churned, 1 = churned - TODO confirm.
churn_labels = {0: 'No', 1: 'Yes'}
tick_labels = [churn_labels.get(value, str(value)) for value in churn_counts.index]

# Create a bar chart
plt.bar(churn_counts.index, churn_counts.values)
plt.xlabel('Churn')
plt.ylabel('Count')
plt.xticks(churn_counts.index, tick_labels)
plt.title('Churn Distribution')
plt.show()

In [None]:
# Distribution of customer tenure, to show which tenure ranges are common.

# Histogram with 10 bins and black bar outlines
fig, ax = plt.subplots()
ax.hist(data_encoded['Tenure'], bins=10, edgecolor='k')
ax.set_xlabel('Tenure')
ax.set_ylabel('Frequency')
ax.set_title('Customer Tenure Distribution')
plt.show()

In [None]:
# Explore the relationship between the order count and the cashback amount

# Scatter plot: OrderCount on the x-axis, CashbackAmount on the y-axis
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(data_encoded['OrderCount'], data_encoded['CashbackAmount'], c='blue', alpha=0.7)
ax.set_xlabel('Order Count (Last Month)')
ax.set_ylabel('Cashback Amount (Average)')
ax.set_title('Scatter Plot: Order Count vs. Cashback Amount')
ax.grid(True)
plt.show()

In [None]:
# Relationship between the number of coupons used and the number of orders placed

# Scatter plot: CouponUsed on the x-axis, OrderCount on the y-axis
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(data_encoded['CouponUsed'], data_encoded['OrderCount'], c='green', alpha=0.7)
ax.set_xlabel('Coupon Used (Last Month)')
ax.set_ylabel('Order Count (Last Month)')
ax.set_title('Scatter Plot: Coupon Used vs. Order Count')
ax.grid(True)
plt.show()

In [None]:
# Count plot of complaints, split by churned vs. non-churned customers.

# Label the hue values before plotting so the legend entries are tied to the
# data. Overriding legend labels afterwards without handles binds them to
# whichever artists come first on the axes, which need not be one per hue level.
# Assumes Churn is coded 0 = not churned, 1 = churned - TODO confirm.
complaint_plot_data = data_encoded.assign(
    Churn=data_encoded['Churn'].map({0: 'No Churn', 1: 'Churn'})
)

plt.figure(figsize=(8, 6))
sns.countplot(
    data=complaint_plot_data,
    x='Complain',
    hue='Churn',
    hue_order=['No Churn', 'Churn'],  # fixed order, independent of row order
)
plt.title('Complaints vs. Churn')
plt.xlabel('Complaint')
plt.ylabel('Count')
plt.show()

In [None]:
# Stacked bar chart of churn counts per preferred login device.

# Count customers per (login-device code, churn value); a combination that
# never occurs is counted as 0 instead of NaN.
device_churn_counts = (
    data_encoded.groupby(['PreferredLoginDevice', 'Churn']).size().unstack(fill_value=0)
)

# Recover the code -> original label mapping from the fitted encoder rather
# than hard-coding the label order, which depends on how the encoder assigned
# its codes.
device_codes = encoder.transform(data)['PreferredLoginDevice']
code_to_label = dict(zip(device_codes, data['PreferredLoginDevice']))
device_labels = [str(code_to_label.get(code, code)) for code in device_churn_counts.index]

# `grouped_data` holds the matplotlib Axes returned by .plot (name kept as is)
grouped_data = device_churn_counts.plot(kind='bar', stacked=True)

# Set the plot title, x-label, and y-label
plt.title('Churn by PreferredLoginDevice ')
plt.xlabel('PreferredLoginDevice')
plt.ylabel('Count')

# One tick per bar, labelled with the device name
plt.xticks(range(len(device_labels)), device_labels)

# Show the plot
plt.show()


In [None]:
# Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Features: every column except the label and the customer identifier.
# CustomerID is only a key, so it is not offered to the tree as a feature.
X = data_encoded.drop(columns=["Churn", "CustomerID"])
y = data_encoded['Churn']

# Split the data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Create a Decision Tree classifier; a fixed random_state makes the fitted
# tree, and therefore the reported scores, reproducible across runs
dt_classifier = DecisionTreeClassifier(random_state=42)

# Train the classifier on the training data
dt_classifier.fit(X_train, y_train)

# Make predictions on the training and test data
y_train_pred = dt_classifier.predict(X_train)
y_test_pred = dt_classifier.predict(X_test)

# Accuracy on the training data and on the held-out test data
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print("Decision Tree Training Accuracy:", train_accuracy)
print("Decision Tree Testing Accuracy:", test_accuracy)


In [None]:
# Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Features: every column except the label and the customer identifier.
# CustomerID is only a key, so it is not used as a feature.
X = data_encoded.drop(columns=["Churn", "CustomerID"])
y = data_encoded['Churn']

# Split the data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Create a Logistic Regression model. The features are on very different
# scales, so they are standardised first (the scaler is fitted on the training
# split only, inside the pipeline) and the solver gets a larger iteration
# budget so it can converge.
logistic_regression_model = make_pipeline(
    StandardScaler(),
    LogisticRegression(max_iter=1000),
)

# Train the model on the training data
logistic_regression_model.fit(X_train, y_train)

# Calculate and print the training score (mean accuracy)
training_score = logistic_regression_model.score(X_train, y_train)
print("Logistic Regression Training Score:", training_score)

# Calculate and print the testing score (mean accuracy)
testing_score = logistic_regression_model.score(X_test, y_test)
print("Logistic Regression Testing Score:", testing_score)
