In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import (
    accuracy_score, 
    precision_score, 
    recall_score, 
    r2_score, 
    mean_squared_error,
    mean_absolute_error
)


In [4]:
data = pd.read_csv('sampled_tbc_training_data.csv') 

In [None]:
data.describe()

Unnamed: 0,Customer_ID,recency,frequency,value,age,units,Units_BLT,Units_FRM,Units_IND,Units_MBT,...,INITTOENT,INTWMT,INTWT3,IPIRFSTRK,ISIGNPRO,ITADMILLNM,ITADSEPLUS,ITBCCORE,ITOYODRVN,OREXCEPT
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,...,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,30608260.0,48.858,87.221,480.99559,287.642,280.374474,0.393928,1.503271,0.775262,0.132505,...,0.007,0.0,0.013,0.001,0.003,0.0,0.0105,0.0365,0.0005,0.0
std,10380770.0,85.829244,278.567101,575.517781,170.286255,1119.791115,2.815687,8.566144,5.050475,0.974669,...,0.083394,0.0,0.113302,0.031615,0.054704,0.0,0.101956,0.187578,0.022361,0.0
min,10000110.0,1.0,1.0,21.96,1.0,0.038593,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,25284430.0,4.0,4.75,275.48563,147.0,12.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,34079790.0,12.0,17.0,384.518418,274.0,48.749337,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,40021720.0,49.0,60.0,530.049583,430.0,173.129705,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,40062320.0,561.0,7240.0,13579.5,605.0,34485.496354,57.0,154.0,97.0,18.0,...,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0


In [6]:
data.shape

(2000, 229)

In [7]:
#checking for null values and printing the column names if null values are present
null_columns = data.columns[data.isnull().any()]
if len(null_columns) > 0:
    print("Columns with null values:", null_columns.tolist())

In [8]:
#checking for duplicate rows
duplicate_rows = data[data.duplicated()]
if not duplicate_rows.empty:
    print("Duplicate rows found:")
    print(duplicate_rows)

In [9]:
#checking for duplicate columns
duplicate_columns = data.columns[data.columns.duplicated()]
if len(duplicate_columns) > 0:
    print("Duplicate columns found:", duplicate_columns.tolist())

In [10]:
data.dtypes.value_counts()

float64    203
int64       26
Name: count, dtype: int64

In [11]:
data.describe()

Unnamed: 0,Customer_ID,recency,frequency,value,age,units,Units_BLT,Units_FRM,Units_IND,Units_MBT,...,INITTOENT,INTWMT,INTWT3,IPIRFSTRK,ISIGNPRO,ITADMILLNM,ITADSEPLUS,ITBCCORE,ITOYODRVN,OREXCEPT
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,...,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,30608260.0,48.858,87.221,480.99559,287.642,280.374474,0.393928,1.503271,0.775262,0.132505,...,0.007,0.0,0.013,0.001,0.003,0.0,0.0105,0.0365,0.0005,0.0
std,10380770.0,85.829244,278.567101,575.517781,170.286255,1119.791115,2.815687,8.566144,5.050475,0.974669,...,0.083394,0.0,0.113302,0.031615,0.054704,0.0,0.101956,0.187578,0.022361,0.0
min,10000110.0,1.0,1.0,21.96,1.0,0.038593,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,25284430.0,4.0,4.75,275.48563,147.0,12.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,34079790.0,12.0,17.0,384.518418,274.0,48.749337,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,40021720.0,49.0,60.0,530.049583,430.0,173.129705,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,40062320.0,561.0,7240.0,13579.5,605.0,34485.496354,57.0,154.0,97.0,18.0,...,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0


In [12]:
x=data.drop('sales',axis=1)
y=data['sales']

In [13]:
#find columns which has min as 0 or 0.0 and max as 1 or 1.0 and if they have only 2 unique values
binary_columns = [col for col in x.columns if (x[col].min() in [0, 0.0]) and (x[col].max() in [1, 1.0]) and (x[col].nunique() == 2)]
print("Binary columns:", binary_columns)

Binary columns: ['Units_SAMSON', 'churn', 'Channel_CD', 'Channel_FS', 'Channel_NR', 'Channel_NUC', 'IsProgram', 'BO_GROUP', 'CBI_GROUP', 'CDGTREAD', 'CDGVTP', 'ICOOPMED', 'IFLEETDLR', 'IHANKONE', 'IMASTALLI', 'INEXNTLVL', 'INITTOENT', 'INTWT3', 'IPIRFSTRK', 'ISIGNPRO', 'ITADSEPLUS', 'ITBCCORE', 'ITOYODRVN']


In [14]:
len(binary_columns)

23

In [15]:
#convert binary columns to category
for col in binary_columns:
    x[col] = x[col].astype('category')

In [16]:
#seperate numerical and categorical columns
numerical_cols = x.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = x.select_dtypes(include=['category', 'object']).columns
print("Numerical columns:", numerical_cols.tolist())
print("Categorical columns:", categorical_cols.tolist())

Numerical columns: ['Customer_ID', 'recency', 'frequency', 'value', 'age', 'units', 'Units_BLT', 'Units_FRM', 'Units_IND', 'Units_MBT', 'Units_MRT', 'Units_OTR', 'Units_PASS', 'Units_PSUV', 'Units_RLT', 'Units_SPC', 'Units_TRLR', 'Units_-', 'Units_Customer-Billed', 'Units_National Acct', 'Units_ADVANCE', 'Units_BFGOODRICH', 'Units_BKT', 'Units_BRIDGESTONE', 'Units_CARLISLE', 'Units_CONTINENTAL', 'Units_COOPER', 'Units_CORDOVAN', 'Units_CROSSWIND', 'Units_DEESTONE', 'Units_DELTA', 'Units_DORAL', 'Units_DOUBLE COIN', 'Units_DURATURN', 'Units_DURO', 'Units_EL DORADO', 'Units_FIRESTONE', 'Units_GENERAL', 'Units_GLADIATOR', 'Units_GOODYEAR', 'Units_HANKOOK', 'Units_HARVEST KING', 'Units_LAUFENN', 'Units_MALHOTRA', 'Units_MASTERCRAFT', 'Units_MAXXIS', 'Units_MICHELIN', 'Units_MULTI MILE', 'Units_NANCO', 'Units_NANKANG', 'Units_NATIONAL', 'Units_NEXEN', 'Units_NITTO', 'Units_NORTECH', 'Units_PIRELLI', 'Units_POWER KING', 'Units_PRINX', 'Units_SAILUN', 'Units_SAXON', 'Units_SOLIDEAL', 'Units_S

In [17]:
len(numerical_cols)

205

In [1]:
#remove the columns which has more than 0.75 correlation with each other in numerical columns using simple code
corr_matrix = x[numerical_cols].corr().abs()
upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > 0.75)]
x = x.drop(columns=to_drop, axis=1)
print("Dropped columns due to high correlation:", to_drop)


NameError: name 'x' is not defined

In [None]:
#update numerical_cols after dropping the columns
numerical_cols = x.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = x.select_dtypes(include=['category', 'object']).columns

In [None]:
len(to_drop)

In [None]:
#print the final shape of x
print("Final shape of x:", x.shape)

In [None]:
#scale the numerical columns
scaler = StandardScaler()
x[numerical_cols] = scaler.fit_transform(x[numerical_cols])

In [None]:
#test train split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [None]:
#linear regression model
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
model = LinearRegression()
model.fit(x_train, y_train)

In [None]:
y_train_pred = model.predict(x_train)
y_test_pred = model.predict(x_test)
# Generate predictions on test set
y_pred = model.predict(x_test)


In [None]:
print("\n" + "="*50)
print("MODEL PERFORMANCE")
print("="*50)

# Training metrics
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
train_mae = mean_absolute_error(y_train, y_train_pred)
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)

print("\n   Training Set:")
print(f"   RMSE: {train_rmse:,.2f}")
print(f"   MAE:  {train_mae:,.2f}")
print(f"   R² Score: {train_r2:.4f}")

In [None]:
print("\n Overfitting Check:")
if train_r2 - test_r2 > 0.1:
    print(" WARNING: Possible overfitting detected!")
    print(f" raining R² is {train_r2 - test_r2:.4f} higher than test R²")
else:
    print("Model generalizes well!")
    print(f" Training R²: {train_r2:.4f}, Test R²: {test_r2:.4f}")

In [None]:
#top 10 important features
feature_importances = pd.Series(model.coef_, index=x.columns)
top_10_features = feature_importances.abs().sort_values(ascending=False).head(10)
print("\n Top 10 Important Features:")
print(top_10_features)

In [None]:
#scatter plot between actual and predicted values with different colors
plt.figure(figsize=(10, 6))
plt.scatter(y_train, y_train_pred, color='blue', label='Train Predictions')
plt.scatter(y_test, y_test_pred, color='orange', label='Test Predictions')
plt.plot([y.min(), y.max()], [y.min(), y.max()], color='red', linestyle='--', label='Perfect Prediction')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.title('Actual vs Predicted Values')
plt.legend()
plt.grid()
plt.show()
