In [4]:
import pandas as pd  # Used for handling and manipulating structured data in DataFrames.
import numpy as np  # Used for numerical operations, arrays, and handling missing values.

from sklearn.model_selection import train_test_split, GridSearchCV  
# train_test_split: Splits the dataset into training and testing sets.
# GridSearchCV: Performs hyperparameter tuning using cross-validation to find the best parameters.

from sklearn.ensemble import RandomForestClassifier  
# RandomForestClassifier: A robust ensemble learning algorithm used for classification tasks.

from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder  
# LabelEncoder: Converts categorical labels into numerical format.
# StandardScaler: Standardizes features by removing the mean and scaling to unit variance.
# OneHotEncoder: Converts categorical variables into binary (one-hot) encoded format for machine learning models.

from sklearn.impute import SimpleImputer  # Handles missing values by replacing them with mean, median, mode, or a constant.
from sklearn.metrics import accuracy_score  # Evaluates classification model accuracy.
from imblearn.over_sampling import SMOTE  # Handles imbalanced datasets by generating synthetic samples for the minority class.
from sklearn.feature_selection import SelectKBest, f_classif  # Selects k best features based on a scoring function.
import xgboost as xgb  # Optimized gradient boosting algorithm for structured/tabular data.
import warnings  # Manages warnings, useful for suppressing unnecessary messages.
import pickle  # Used for saving and loading machine learning models.

In [10]:
warnings.filterwarnings("ignore") #  used to suppress all warnings in Python.

In [20]:
# Read the pistol dataset from a CSV file (path is relative to the working
# directory) into the DataFrame `df`.
file_path = "Updated_Indian_Army_Pistols_Corrected.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

In [22]:
# Bare expression: the notebook renders the DataFrame as this cell's output.
df

Unnamed: 0,Pistol Name,Barrel Length,Caliber,Magazine Capacity,Weight,Reliability,Recoil Level,Concealability,Price,Action Type,Sight Type,Military Use,Best Use Case
0,Glock 17,5.8,13,10,0.60,8.8,High,Average,525,Single Action,Laser Sight,No,Training
1,Colt M1911,5.2,13,20,0.89,9.3,Low,Average,944,Double Action,Laser Sight,Yes,Surgical Strike
2,Colt M1911,5.8,12,12,1.18,7.3,High,Good,1262,Semi-Automatic,Laser Sight,Yes,House-to-House Combat
3,Beretta M9,3.7,12,15,1.02,9.4,Low,Excellent,1465,Semi-Automatic,Optical,Yes,VIP Protection
4,Beretta M9,3.9,12,17,0.62,7.5,High,Good,1464,Double Action,Optical,Yes,Surgical Strike
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,Sig Sauer P320,5.0,13,10,0.82,7.4,High,Poor,1482,Striker-Fired,Night Vision,No,Surveillance
96,Colt M1911,4.4,11,20,0.82,9.8,High,Poor,601,Semi-Automatic,Optical,No,House-to-House Combat
97,Colt M1911,3.7,11,15,0.82,8.9,High,Good,668,Single Action,Optical,No,VIP Protection
98,Smith & Wesson M&P9,5.2,10,17,0.63,8.1,Low,Poor,1643,Double Action,Optical,Yes,Self-Defense


In [26]:
# Tally the null entries per column; the bare name on the last line lets the
# notebook render the resulting Series as the cell output.
missing_values = df.isna().sum()
missing_values

Pistol Name          0
Barrel Length        0
Caliber              0
Magazine Capacity    0
Weight               0
Reliability          0
Recoil Level         0
Concealability       0
Price                0
Action Type          0
Sight Type           0
Military Use         0
Best Use Case        0
dtype: int64

In [28]:
# Quick data-quality report, printed one section at a time:
#   1. null count per column
#   2. describe() statistics
#   3. number of duplicated rows
#   4. dtype of every column
eda_report = (
    ("\nMissing Values in Each Column:\n", df.isna().sum()),
    ("\nStatistical Summary:\n", df.describe()),
    ("\nNumber of Duplicate Rows:", df.duplicated().sum()),
    ("\nColumn Data Types:\n", df.dtypes),
)
for section_title, section_value in eda_report:
    print(section_title, section_value)



Missing Values in Each Column:
 Pistol Name          0
Barrel Length        0
Caliber              0
Magazine Capacity    0
Weight               0
Reliability          0
Recoil Level         0
Concealability       0
Price                0
Action Type          0
Sight Type           0
Military Use         0
Best Use Case        0
dtype: int64

Statistical Summary:
        Barrel Length     Caliber  Magazine Capacity      Weight  Reliability  \
count     100.000000  100.000000         100.000000  100.000000   100.000000   
mean        4.816000   11.130000          14.640000    0.889500     8.459000   
std         0.702049    1.454043           3.588759    0.168849     0.927045   
min         3.600000    9.000000          10.000000    0.600000     7.000000   
25%         4.200000   10.000000          12.000000    0.757500     7.575000   
50%         4.900000   11.000000          15.000000    0.860000     8.500000   
75%         5.400000   12.250000          17.000000    1.032500     9.30

In [30]:
# Define essential column names: the pistol-name column and the label column.
pistol_name_column, label = "Pistol Name", "Best Use Case"