In [1]:
from google.colab import drive
import pandas as pd

# Unmount the drive if already mounted
try:
    drive.flush_and_unmount()
except ValueError:
    pass  # Drive was not mounted, continue

# Mount the drive
drive.mount('/content/drive')

# Now try reading the CSV
df = pd.read_csv('/content/drive/MyDrive/Cleaaned_Dataset_22070521115_Aryavardhan_Deshmukh_C.csv')
display(df.head())

Mounted at /content/drive


Unnamed: 0,id,date,country_name,alpha_3_code,country_code,region,region_code,sub_region,sub_region_code,hs_code,commodity,unit,value_qt,value_rs,value_dl
0,0,2015-01-01,Antigua and Barbuda,ATG,28.0,Americas,19,Latin America and the Caribbean,419,8042090,Other Figs Excldng Frsh,Kgs,17.6,79.55,0.13
1,1,2015-01-01,Argentina,ARG,32.0,Americas,19,Latin America and the Caribbean,419,8092900,Other Cherries,Kgs,4.48,15.15,0.02
2,2,2015-01-01,Argentina,ARG,32.0,Americas,19,Latin America and the Caribbean,419,9024030,"Tea Black,Dust In Bulk",Kgs,48.0,51.41,0.08
3,3,2015-01-01,Argentina,ARG,32.0,Americas,19,Latin America and the Caribbean,419,9024060,Tea Black Waste,Kgs,80.0,70.25,0.11
4,4,2015-01-01,Argentina,ARG,32.0,Americas,19,Latin America and the Caribbean,419,9024090,Other Black Tea,Kgs,44.8,31.07,0.05


In [2]:
# Cell 0 — run once
!pip install -q xgboost category_encoders scikit-learn seaborn plotly

import os
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.metrics import mean_squared_error, r2_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import xgboost as xgb
import category_encoders as ce


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/85.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.9/85.9 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25h

# 3. Feature engineering + cardinality **reduction**

In [3]:
# Cell 3 — create features, reduce high-cardinality categories
# Keep top N countries & commodities, label rest 'Other'
TOP_COUNTRIES = 20
TOP_COMMODITIES = 50

if 'country_name' in df.columns:
    top_countries = df['country_name'].value_counts().nlargest(TOP_COUNTRIES).index
    df['country_top'] = df['country_name'].where(df['country_name'].isin(top_countries), 'Other')

if 'commodity' in df.columns:
    top_comm = df['commodity'].value_counts().nlargest(TOP_COMMODITIES).index
    df['commodity_top'] = df['commodity'].where(df['commodity'].isin(top_comm), 'Other')

# Price per unit (where value_qt != 0)
df['price_per_unit_rs'] = df['value_rs'] / (df['value_qt'].replace(0, np.nan))
df['log_value_rs'] = np.log1p(df['value_rs'])  # useful for modeling skew

# Short descriptive check
df[['country_top','commodity_top','value_qt','value_rs','log_value_rs','price_per_unit_rs']].head()


Unnamed: 0,country_top,commodity_top,value_qt,value_rs,log_value_rs,price_per_unit_rs
0,Other,Other,17.6,79.55,4.388878,4.519886
1,Argentina,Other,4.48,15.15,2.78192,3.381696
2,Argentina,Other,48.0,51.41,3.959097,1.071042
3,Argentina,Other,80.0,70.25,4.266195,0.878125
4,Argentina,Other,44.8,31.07,3.467921,0.693527


# **Level 1: Regression — Predict value_rs**


In [4]:
#4. Build dataset for regression

# Cell 4 — choose features & target
features = ['value_qt','year','month','country_top','commodity_top']
target = 'value_rs'  # or 'log_value_rs' to model logged target

# Ensure 'date' column is datetime type
df['date'] = pd.to_datetime(df['date'])

# Extract year and month
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month

X = df[features].copy()
y = df['value_rs']   # or df['log_value_rs']

# Categorical encoding: use TargetEncoder for high-cardinality categories
cat_feats = [f for f in features if f in ['country_top','commodity_top']]
num_feats = [f for f in features if f not in cat_feats]

# Fill missing numerical with median
for n in num_feats:
    X[n] = X[n].fillna(X[n].median())

# Simple pipeline: target encoding -> scale numeric
encoder = ce.TargetEncoder(cols=cat_feats)
X_enc = encoder.fit_transform(X, y)

scaler = StandardScaler()
X_enc[num_feats] = scaler.fit_transform(X_enc[num_feats])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_enc, y, test_size=0.2, random_state=42)

print("Train shape:", X_train.shape, "Test shape:", X_test.shape)

Train shape: (529702, 5) Test shape: (132426, 5)


### Cell 5 — Baseline Linear **Regression**

In [None]:
# Cell 5 — Baseline Linear Regression
lr = LinearRegression()
lr.fit(X_train, y_train)
pred_lr = lr.predict(X_test)
print("LinearReg RMSE:", np.sqrt(mean_squared_error(y_test, pred_lr)))
print("LinearReg R2:", r2_score(y_test, pred_lr))

# Random Forest Regressor (baseline)
rfr = RandomForestRegressor(n_estimators=100, max_depth=12, random_state=42, n_jobs=-1)
rfr.fit(X_train, y_train)
pred_rf = rfr.predict(X_test)
print("RF RMSE:", np.sqrt(mean_squared_error(y_test, pred_rf)))
print("RF R2:", r2_score(y_test, pred_rf))


LinearReg RMSE: 9385.242100931106
LinearReg R2: 0.04032883120478947


## 6. XGBoost (better performance usually)**bold text**

In [None]:
# Cell 6 — XGBoost
xgb_model = xgb.XGBRegressor(n_estimators=300, max_depth=8, learning_rate=0.05, random_state=42, n_jobs=4)
xgb_model.fit(X_train, y_train, eval_set=[(X_test, y_test)])
pred_xgb = xgb_model.predict(X_test)
print("XGB RMSE:", np.sqrt(mean_squared_error(y_test, pred_xgb)))
print("XGB R2:", r2_score(y_test, pred_xgb))

# 7. Feature importance + partial **interpretation** **bold text**

In [None]:
# Cell 7 — feature importance from RandomForest or XGBoost
importances = pd.Series(rfr.feature_importances_, index=X_train.columns).sort_values(ascending=False)
print(importances.head(30))
importances.head(20).plot(kind='barh', figsize=(8,6))
plt.title("Feature importance — RandomForest")
plt.show()


# **Level 2: Classification — categorize value_rs into Low/Medium/High**
============================

In [None]:
# Cell 8 — create 3 classes using quantiles (can change to custom thresholds)
df_clf = df.copy()
df_clf = df_clf.dropna(subset=['value_rs','value_qt'])
df_clf['value_class'] = pd.qcut(df_clf['value_rs'], q=3, labels=['low','medium','high'])

# Build features as before
features = ['value_qt','year','month','country_top','commodity_top']
Xc = df_clf[features].copy()
yc = df_clf['value_class']

# encode & scale
cat_feats = ['country_top','commodity_top']
Xc_enc = encoder.transform(Xc)  # reuse target encoder trained earlier; if broken, fit a new one
Xc_enc[num_feats] = scaler.transform(Xc_enc[num_feats])

Xc_train, Xc_test, yc_train, yc_test = train_test_split(Xc_enc, yc, test_size=0.2, random_state=42)


Classification MOdels - Random Forest

In [None]:
# Cell 9 — Random Forest Classifier
rfc = RandomForestClassifier(n_estimators=200, max_depth=12, random_state=42, n_jobs=-1)
rfc.fit(Xc_train, yc_train)
pred_rfc = rfc.predict(Xc_test)
print(classification_report(yc_test, pred_rfc))
print("Confusion matrix:")
sns.heatmap(confusion_matrix(yc_test, pred_rfc), annot=True, fmt='d')
plt.show()

# Logistic Regression (baseline)
log = LogisticRegression(max_iter=1000)
log.fit(Xc_train, yc_train)
pred_log = log.predict(Xc_test)
print("LogReg report:")
print(classification_report(yc_test, pred_log))


## **HAndle class Imbalance**

In [None]:
# Cell 10 — if classes are imbalanced, try class_weight or resampling
from sklearn.utils import class_weight
cw = class_weight.compute_class_weight('balanced', classes=np.unique(yc_train), y=yc_train)
print("Class weights:", dict(zip(np.unique(yc_train), cw)))

# Example: RandomForest with class_weight (if you switch to sklearn's estimators that accept it)
# rfc_bal = RandomForestClassifier(n_estimators=200, class_weight='balanced', random_state=42)
# rfc_bal.fit(Xc_train, yc_train)


# **Level 4: Clustering — KMeans on countries / commodities**

11. Prepare aggregation — cluster countries by trade profile

In [None]:
# Cell 11 — aggregate per country (or per commodity)
agg_country = df.groupby('country_name').agg(
    total_value_rs = ('value_rs','sum'),
    mean_value_rs = ('value_rs','mean'),
    median_value_rs = ('value_rs','median'),
    total_qty = ('value_qt','sum'),
    n_shipments = ('id','count')
).reset_index().fillna(0)

# Feature matrix for clustering
cluster_features = ['total_value_rs','mean_value_rs','total_qty','n_shipments']
X_cluster = agg_country[cluster_features].copy()

# Scale
scaler_cl = StandardScaler()
X_cluster_scaled = scaler_cl.fit_transform(X_cluster)


# 12. Choose k with elbow / **silhouette**

In [None]:
# Cell 12 — Elbow plot
inertia = []
K = range(2,10)
for k in K:
    km = KMeans(n_clusters=k, random_state=42, n_init=10)
    km.fit(X_cluster_scaled)
    inertia.append(km.inertia_)

plt.plot(K, inertia, '-o')
plt.xlabel('k')
plt.ylabel('inertia')
plt.title('Elbow Method for k')
plt.show()


13. Fit KMeans and inspect clusters

In [None]:
# Cell 13 — choose k (e.g., k=4)
k = 6
km = KMeans(n_clusters=k, random_state=42, n_init=20)
labels = km.fit_predict(X_cluster_scaled)
agg_country['cluster'] = labels

# Visualize clusters with PCA (2D)
pca = PCA(n_components=2, random_state=42)
pcs = pca.fit_transform(X_cluster_scaled)
agg_country['pc1'] = pcs[:,0]
agg_country['pc2'] = pcs[:,1]

plt.figure(figsize=(8,6))
sns.scatterplot(data=agg_country, x='pc1', y='pc2', hue='cluster', s=80)
plt.title('Country Clusters (PCA reduced)')
plt.legend()
plt.show()

# Inspect cluster centers (in original scale)
centers = scaler_cl.inverse_transform(km.cluster_centers_)
centers_df = pd.DataFrame(centers, columns=cluster_features)
print("Cluster centers (approx):")
print(centers_df)
agg_country.groupby('cluster').agg({'country_name':'count','total_value_rs':'mean','total_qty':'mean'})


## 14. Cluster commodities (similar approach)

In [None]:
# Cell 14 — commodity-level clustering
agg_comm = df.groupby('commodity_top').agg(
    total_value_rs = ('value_rs','sum'),
    mean_value_rs = ('value_rs','mean'),
    total_qty = ('value_qt','sum'),
    n_shipments = ('id','count')
).reset_index().fillna(0)

Xc2 = agg_comm[['total_value_rs','mean_value_rs','total_qty','n_shipments']]
Xc2_s = StandardScaler().fit_transform(Xc2)
km2 = KMeans(n_clusters=5, random_state=42, n_init=20)
agg_comm['cluster'] = km2.fit_predict(Xc2_s)

# show top commodities per cluster
for cl in sorted(agg_comm['cluster'].unique()):
    display(agg_comm[agg_comm['cluster']==cl].sort_values('total_value_rs', ascending=False).head(10))


In [None]:
import joblib

# Save each model
joblib.dump(rfr, 'regression_model_rf.joblib')
joblib.dump(rfc, 'classification_model_rf.joblib')
joblib.dump(km, 'clustering_model_kmeans.joblib')

# Save encoders and scalers
joblib.dump(encoder, 'target_encoder.joblib')
joblib.dump(scaler, 'scaler.joblib')

print("✅ All models saved successfully!")
