In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
import joblib
import warnings 

# Install a global "ignore" filter: no message/category/module restriction is
# given, so every warning raised after this point is suppressed.
# NOTE(review): presumably done to keep cell output tidy, but it also hides
# deprecation and numerical warnings from pandas/sklearn/xgboost — consider
# narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [None]:
# Read the raw CSV (path is relative to the notebook's working directory)
df = pd.read_csv(filepath_or_buffer="pubg.csv")

# Preview the first five rows as the cell's rich output
df.head(n=5)

### Data Preprocessing

In [None]:
# Count null entries per column and print the resulting Series
missing_counts = df.isnull().sum()
print("Missing Values:\n", missing_counts)

In [None]:
# Handle missing values.
#
# 1. Rows whose target (winPlacePerc) is missing are dropped instead of imputed:
#    filling the label with the column median would fabricate training and
#    evaluation examples that have no real ground truth.
# 2. Remaining gaps in numeric columns are filled with that column's median.
#    Non-numeric columns are left untouched because the median Series only
#    carries numeric column names.
df = df.dropna(subset=['winPlacePerc'])
numeric_medians = df.select_dtypes(include=[np.number]).median()
df = df.fillna(numeric_medians)

In [None]:
# Discard fully duplicated rows (first occurrence is kept, index is not reset)
df = df.drop_duplicates()

In [None]:
# Drop identifier columns (Id, groupId, matchId) so they are not used as features.
# errors="ignore" makes the cell idempotent: re-running it after the columns
# are already gone no longer raises KeyError.
# NOTE: this also means a misspelled or absent column name is skipped silently.
df = df.drop(columns=['Id', 'groupId', 'matchId'], errors="ignore")

In [None]:
# Label-encode the categorical matchType column.
# The encoder is fitted on the column first and kept in `encoder`, then the
# column is overwritten with the integer codes it produces.
encoder = LabelEncoder().fit(df['matchType'])
df['matchType'] = encoder.transform(df['matchType'])

### Exploratory Data Analysis (EDA)

In [None]:
# Annotated heatmap of the pairwise correlations between all columns of df
corr_matrix = df.corr()

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", ax=ax)
ax.set_title("Feature Correlation Heatmap")
plt.show()

In [None]:
# Histogram of the kills column (30 bins) with a KDE curve overlaid
kills = df['kills']
ax = sns.histplot(kills, bins=30, kde=True)
ax.set_title("Kill Distribution")
plt.show()

### Feature Engineering

In [None]:
# Derived features. Each denominator is offset by 1 so a zero distance or
# zero damage never causes a division by zero.

# Kills relative to total distance travelled (ride + walk + swim)
total_distance = df['rideDistance'] + df['walkDistance'] + df['swimDistance']
df['kill_per_distance'] = df['kills'] / (total_distance + 1)

# Kills plus assists relative to damage dealt
takedowns = df['kills'] + df['assists']
df['efficiency_score'] = takedowns / (df['damageDealt'] + 1)

### Prepare Data for Model Training

In [None]:
# Separate the target from the predictors
y = df.loc[:, 'winPlacePerc']           # target variable
X = df.drop('winPlacePerc', axis=1)     # every remaining column is a feature

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
splits = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = splits

In [None]:
# Standardize features: the scaler's statistics are learned from the training
# split only and then applied to both splits, so the test split does not
# influence the scaling parameters.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

### Train and Evaluate Models

In [None]:
# Linear Regression: fit on the training split, predict the test split
lr = LinearRegression().fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

# Score the predictions against the held-out targets
mse_lr = mean_squared_error(y_true=y_test, y_pred=y_pred_lr)
r2_lr = r2_score(y_true=y_test, y_pred=y_pred_lr)
print(f"Linear Regression - MSE: {mse_lr}, R2 Score: {r2_lr}")

In [None]:
# Random Forest: 50 trees, fixed seed, all CPU cores (n_jobs=-1)
rf = RandomForestRegressor(
    n_estimators=50,
    n_jobs=-1,
    random_state=42,
)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Score the predictions against the held-out targets
mse_rf = mean_squared_error(y_true=y_test, y_pred=y_pred_rf)
r2_rf = r2_score(y_true=y_test, y_pred=y_pred_rf)
print(f"Random Forest - MSE: {mse_rf}, R2 Score: {r2_rf}")

In [None]:
# Gradient Boosting: 50 boosting stages, learning rate 0.1, fixed seed
gbr = GradientBoostingRegressor(
    n_estimators=50,
    learning_rate=0.1,
    random_state=42,
).fit(X_train, y_train)
y_pred_gbr = gbr.predict(X_test)

# Score the predictions against the held-out targets
mse_gbr = mean_squared_error(y_true=y_test, y_pred=y_pred_gbr)
r2_gbr = r2_score(y_true=y_test, y_pred=y_pred_gbr)
print(f"Gradient Boosting - MSE: {mse_gbr}, R2 Score: {r2_gbr}")

In [None]:
# XGBoost: 50 boosting rounds, learning rate 0.1, fixed seed
xgb_params = dict(n_estimators=50, learning_rate=0.1, random_state=42)
xgb = XGBRegressor(**xgb_params)
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)

# Score the predictions against the held-out targets
mse_xgb = mean_squared_error(y_true=y_test, y_pred=y_pred_xgb)
r2_xgb = r2_score(y_true=y_test, y_pred=y_pred_xgb)
print(f"XGBoost - MSE: {mse_xgb}, R2 Score: {r2_xgb}")

### Compare Model Performance

In [None]:
# Collect each model's test-set metrics into one comparison table,
# one row per model in the order the models were trained.
results = pd.DataFrame(
    [
        ("Linear Regression", mse_lr, r2_lr),
        ("Random Forest", mse_rf, r2_rf),
        ("Gradient Boosting", mse_gbr, r2_gbr),
        ("XGBoost", mse_xgb, r2_xgb),
    ],
    columns=["Model", "MSE", "R2 Score"],
)

print(results)

### Feature Importance (Using XGBoost)

In [None]:
# Horizontal bar chart of the fitted XGBoost model's feature importances,
# labelled with the column names of X (the model itself was trained on the
# scaled array, which carries no names).
importances = xgb.feature_importances_
features = X.columns

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=importances, y=features, ax=ax)
ax.set_title("Feature Importance (XGBoost)")
plt.show()

### Save Best Model for Deployment

In [None]:
# Model persistence is currently disabled: the dump call below is commented out,
# so running this cell writes nothing to disk.
# NOTE(review): the model was trained on features transformed by `scaler`
# (and with matchType mapped by `encoder`), so inference code would need those
# fitted objects as well — consider saving them alongside the model.
# NOTE(review): "best model" is asserted here, not computed; confirm against
# the `results` table before enabling.
## joblib.dump(xgb, "pubg_win_predictor.pkl")  # Save the best model