In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import plot_tree
from sklearn.model_selection import train_test_split
import warnings
# Blanket filter: with no category or module given, this suppresses every
# warning raised after this point, including library deprecation notices.
warnings.filterwarnings('ignore')

In [14]:

# Read the training CSV into the working frame `df`.
train_csv_path = 'sample_data/train.csv'
df = pd.read_csv(train_csv_path)

In [15]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

(9800, 18)

In [16]:
# Column dtypes and non-null counts.
# DataFrame.info() writes its report straight to stdout and returns None, so
# it is called bare; wrapping it in print() would append a stray "None" line.
df.info()

# Missing-value count per column
print(df.isnull().sum())

# Summary statistics for numerical columns
display(df.describe())

# Summary statistics for categorical (object-dtype) columns
display(df.describe(include='object'))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9800 entries, 0 to 9799
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Row ID         9800 non-null   int64  
 1   Order ID       9800 non-null   object 
 2   Order Date     9800 non-null   object 
 3   Ship Date      9800 non-null   object 
 4   Ship Mode      9800 non-null   object 
 5   Customer ID    9800 non-null   object 
 6   Customer Name  9800 non-null   object 
 7   Segment        9800 non-null   object 
 8   Country        9800 non-null   object 
 9   City           9800 non-null   object 
 10  State          9800 non-null   object 
 11  Postal Code    9789 non-null   float64
 12  Region         9800 non-null   object 
 13  Product ID     9800 non-null   object 
 14  Category       9800 non-null   object 
 15  Sub-Category   9800 non-null   object 
 16  Product Name   9800 non-null   object 
 17  Sales          9800 non-null   float64
dtypes: float

Unnamed: 0,Row ID,Postal Code,Sales
count,9800.0,9789.0,9800.0
mean,4900.5,55273.322403,230.769059
std,2829.160653,32041.223413,626.651875
min,1.0,1040.0,0.444
25%,2450.75,23223.0,17.248
50%,4900.5,58103.0,54.49
75%,7350.25,90008.0,210.605
max,9800.0,99301.0,22638.48


Unnamed: 0,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,State,Region,Product ID,Category,Sub-Category,Product Name
count,9800,9800,9800,9800,9800,9800,9800,9800,9800,9800,9800,9800,9800,9800,9800
unique,4922,1230,1326,4,793,793,3,1,529,49,4,1861,3,17,1849
top,CA-2018-100111,05/09/2017,26/09/2018,Standard Class,WB-21850,William Brown,Consumer,United States,New York City,California,West,OFF-PA-10001970,Office Supplies,Binders,Staple envelope
freq,14,38,34,5859,35,35,5101,9800,891,1946,3140,19,5909,1492,47


In [17]:
# 1. Identify the target variable
target = 'Sales'
y = df[target]

# 2. Build the feature frame: drop the ID columns, the raw date strings and
#    the target itself. `columns=` already names the axis, so no `axis=` is
#    passed alongside it.
columns_to_drop = ['Row ID', 'Order ID', 'Customer ID', 'Product ID', 'Order Date', 'Ship Date', target]
df_processed = df.drop(columns=columns_to_drop)

# 3. Impute missing 'Postal Code' values with the most frequent code.
#    The filled column is assigned back to the frame. Calling
#    fillna(..., inplace=True) on a column selection acts on a temporary
#    object: it is deprecated and leaves the frame unchanged under pandas
#    Copy-on-Write.
if df_processed['Postal Code'].isnull().any():
    most_frequent_postal_code = df_processed['Postal Code'].mode()[0]
    df_processed['Postal Code'] = df_processed['Postal Code'].fillna(most_frequent_postal_code)

# 4. Identify categorical columns
categorical_cols = df_processed.select_dtypes(include='object').columns.tolist()

# 5. Apply one-hot encoding to the identified categorical columns.
#    NOTE(review): 'Customer Name' (793 distinct values) and 'Product Name'
#    (1849 distinct values) are encoded here too, which is why X ends up with
#    3243 columns. Confirm these near-identifier columns are intended features.
X = pd.get_dummies(df_processed, columns=categorical_cols, drop_first=True)

# 6. Features (X) and target (y) are now separate; preview both and confirm
#    that no missing values remain in the features.
print("Processed DataFrame head:")
display(X.head())
print("\nTarget variable head:")
display(y.head())
print("\nMissing values in processed features:")
print(X.isnull().sum().sum())

Processed DataFrame head:


Unnamed: 0,Postal Code,Ship Mode_Same Day,Ship Mode_Second Class,Ship Mode_Standard Class,Customer Name_Aaron Hawkins,Customer Name_Aaron Smayling,Customer Name_Adam Bellavance,Customer Name_Adam Hart,Customer Name_Adam Shillingsburg,Customer Name_Adrian Barton,...,Product Name_Zebra ZM400 Thermal Label Printer,Product Name_Zebra Zazzle Fluorescent Highlighters,Product Name_Zipper Ring Binder Pockets,Product Name_i.Sound Portable Power - 8000 mAh,Product Name_iHome FM Clock Radio with Lightning Dock,"Product Name_iKross Bluetooth Portable Keyboard + Cell Phone Stand Holder + Brush for Apple iPhone 5S 5C 5, 4S 4",Product Name_iOttie HLCRIO102 Car Mount,Product Name_iOttie XL Car Mount,Product Name_invisibleSHIELD by ZAGG Smudge-Free Screen Protector,Product Name_netTALK DUO VoIP Telephone Service
0,42420.0,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,42420.0,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,90036.0,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,33311.0,False,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,33311.0,False,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False



Target variable head:


Unnamed: 0,Sales
0,261.96
1,731.94
2,14.62
3,957.5775
4,22.368



Missing values in processed features:
0


In [18]:
from sklearn.ensemble import RandomForestRegressor

In [19]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the seed is pinned so the split is
# the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the dimensions of each of the four resulting pieces.
for caption, split in (
    ("Shape of X_train:", X_train),
    ("Shape of X_test:", X_test),
    ("Shape of y_train:", y_train),
    ("Shape of y_test:", y_test),
):
    print(caption, split.shape)

Shape of X_train: (7840, 3243)
Shape of X_test: (1960, 3243)
Shape of y_train: (7840,)
Shape of y_test: (1960,)


In [10]:
# Random forest regressor with library-default settings apart from a pinned
# seed, fitted on the training split. The fit call stays as the cell's last
# expression so the fitted estimator is displayed.
forest_settings = {"random_state": 42}
model = RandomForestRegressor(**forest_settings)
model.fit(X_train, y_train)

In [11]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Score the fitted model on the held-out features
y_pred = model.predict(X_test)

# Compute the three regression metrics up front
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report them: MAE first, then MSE, then R-squared
for report_line in (
    f"Mean Absolute Error (MAE): {mae}",
    f"Mean Squared Error (MSE): {mse}",
    f"R-squared (R2): {r2}",
):
    print(report_line)

Mean Absolute Error (MAE): 164.51889292220602
Mean Squared Error (MSE): 461109.8359313903
R-squared (R2): 0.31012870643060886


In [20]:
y_pred = model.predict(X_test)