In [None]:
import pandas as pd
import numpy as np
from scipy.stats import pearsonr
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score


# 2. Load and Inspect the Dataset

In [None]:
# Read the CSV into `data`, the DataFrame that the later cells work from.
# NOTE: the path below is a placeholder and must point at the real dataset file.
data = pd.read_csv('path_to_your_dataset.csv')

# Show the leading rows first, then the summary statistics.
for preview in (data.head(), data.describe()):
    print(preview)


# 3. Data Cleaning and Preparation

In [None]:
# Count the missing entries in each column.
missing_per_column = data.isnull().sum()
print(missing_per_column)

# List the dtype of each column.
column_types = data.dtypes
print(column_types)


# 4. Exploratory Data Analysis (EDA)

## 4.1. Descriptive Statistics

In [None]:
# Summary statistics restricted to the three career metrics under study.
career_metrics = ['career_kills', 'career_wins', 'career_revives']
desc_stats = data[career_metrics].describe()
print(desc_stats)


## 4.2. Correlation Analysis

In [None]:
# Pull the three career series once so each pairing below reads cleanly.
kills = data['career_kills']
wins = data['career_wins']
revives = data['career_revives']

# Kills vs. wins
corr_kills_wins, p_value_kills_wins = pearsonr(kills, wins)
print(
    f"Pearson Correlation between Career Kills and Career Wins: {corr_kills_wins:.4f} (p-value: {p_value_kills_wins:.4f})"
)

# Kills vs. revives
corr_kills_revives, p_value_kills_revives = pearsonr(kills, revives)
print(
    f"Pearson Correlation between Career Kills and Career Revives: {corr_kills_revives:.4f} (p-value: {p_value_kills_revives:.4f})"
)

# Wins vs. revives
corr_wins_revives, p_value_wins_revives = pearsonr(wins, revives)
print(
    f"Pearson Correlation between Career Wins and Career Revives: {corr_wins_revives:.4f} (p-value: {p_value_wins_revives:.4f})"
)


# 5. Regression Analysis

## 5.1. Simple Linear Regression

In [None]:
# Response is career wins; the single predictor is career kills.
y = data['career_wins']
X = data['career_kills']

# Prepend a constant column so the fitted model includes an intercept.
X_with_const = sm.add_constant(X)

# Specify the OLS model, estimate it, and show the full summary table.
ols_spec = sm.OLS(endog=y, exog=X_with_const)
model = ols_spec.fit()
print(model.summary())


# 6. Assumption Verification

## 6.2. Normality of Residuals

In [None]:
from scipy.stats import shapiro

# Residuals of the fitted `model` are the input to the normality test.
residuals = model.resid

# Shapiro-Wilk test: unpack the statistic and its p-value, then report both.
shapiro_stat, shapiro_p = shapiro(residuals)
print(
    f"Shapiro-Wilk Test: Statistic={shapiro_stat:.4f}, p-value={shapiro_p:.4f}"
)


## 6.3. Homoscedasticity

In [None]:
from statsmodels.stats.diagnostic import het_breuschpagan

# Breusch-Pagan test of the residuals against the design matrix (with constant).
bp_test = het_breuschpagan(residuals, X_with_const)

# Pair each returned value with its label, in the order the function returns them.
labels = ['LM Statistic', 'LM Test p-value', 'F-Statistic', 'F-Test p-value']
bp_results = {label: value for label, value in zip(labels, bp_test)}

print("Breusch-Pagan Test Results:")
for key, value in bp_results.items():
    print(f"{key}: {value:.4f}")


## 6.4. Addressing Assumption Violations
**Normality and Homoscedasticity Issues**

**Effect:** These violations suggest that, while the relationship is strong, the model may not fully adhere to the regression assumptions, potentially impacting the precision of the estimates.

**Potential Remedies:**

## Variable Transformation:

Apply logarithmic or square root transformations to Career Kills and/or Career Wins to stabilize variance and achieve normality.

In [None]:
# Store log(1 + x) versions of kills and wins as new columns on `data`.
for source_col in ('career_kills', 'career_wins'):
    data[f'log_{source_col}'] = np.log1p(data[source_col])

# Re-run the regression on the transformed variables.
# NOTE: this rebinds X, y and X_with_const to the log-scale data.
y = data['log_career_wins']
X = data['log_career_kills']
X_with_const = sm.add_constant(X)

model_transformed = sm.OLS(endog=y, exog=X_with_const).fit()
print(model_transformed.summary())


## Robust Regression Techniques:

Utilize regression models less sensitive to assumption violations, such as Huber Regressors or Quantile Regression.

In [None]:
# Import the statsmodels *API* namespace as `sm`, matching the rest of the
# notebook. Binding the bare `statsmodels` package to `sm` would shadow that
# alias, so later `sm.OLS` / `sm.add_constant` / `sm.qqplot` calls would fail
# unless a cell re-imported it first.
import statsmodels.api as sm
from statsmodels.api import RLM
from statsmodels.tools import add_constant

# Design matrix (career kills plus an intercept column) and response (career wins).
X = add_constant(data['career_kills'])
y = data['career_wins']

# Fit the robust linear model using Huber's T norm as the M-estimator,
# then print the summary table.
rlm_model = RLM(y, X, M=sm.robust.norms.HuberT()).fit()
print(rlm_model.summary())


## Non-Parametric Methods:

Employ Spearman's rank correlation to assess monotonic relationships without assuming linearity.

In [None]:
from scipy.stats import spearmanr

# Rank-based (Spearman) correlation between career kills and career wins.
kills_series = data['career_kills']
wins_series = data['career_wins']
spearman_corr, spearman_p = spearmanr(kills_series, wins_series)

print(
    f"Spearman Correlation: {spearman_corr:.4f}, p-value: {spearman_p:.4f}"
)


## Incorporate Additional Predictors:

Career Revives: Include Career Revives as an additional predictor in a multiple regression model to control for supportive actions.

In [None]:
import statsmodels.api as sm
from statsmodels.tools import add_constant

# Response: career wins. Predictors: career kills and career revives,
# with a constant column added for the intercept.
y = data['career_wins']
predictor_columns = ['career_kills', 'career_revives']
X_multi = add_constant(data[predictor_columns])

# Ordinary least squares fit on both predictors; print the summary table.
model_multi = sm.OLS(endog=y, exog=X_multi).fit()
print(model_multi.summary())


## Explore Non-Linear Relationships:

Polynomial Regression: Investigate whether adding quadratic or cubic terms improves model fit.

In [None]:
import statsmodels.api as sm
from statsmodels.tools import add_constant

# Squared career kills, stored as a new column on `data`.
data['career_kills_sq'] = data['career_kills'].pow(2)

# Response plus a design matrix of the linear term, the quadratic term and an intercept.
y = data['career_wins']
polynomial_terms = ['career_kills', 'career_kills_sq']
X_poly = add_constant(data[polynomial_terms])

# Ordinary least squares fit of the quadratic model; print the summary table.
model_poly = sm.OLS(endog=y, exog=X_poly).fit()
print(model_poly.summary())


## Alternative Correlation Measures:

Kendall's Tau: Assess the relationship using Kendall's Tau as another non-parametric measure.

In [None]:
from scipy.stats import kendalltau

# Kendall's tau between career kills and career wins.
kills_series = data['career_kills']
wins_series = data['career_wins']
tau, p_value_tau = kendalltau(kills_series, wins_series)

print(
    f"Kendall's Tau: {tau:.4f}, p-value: {p_value_tau:.4f}"
)


# 6.1. Scatter Plot with Regression Line
Purpose: Visualize the relationship between Career Kills and Career Wins.

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Uses the `data` frame already present in the notebook; nothing is loaded here.
fig, ax = plt.subplots(figsize=(10, 6))

# Semi-transparent points, with a red fitted regression line drawn on the same axes.
sns.scatterplot(x='career_kills', y='career_wins', data=data, alpha=0.5, ax=ax)
sns.regplot(x='career_kills', y='career_wins', data=data, scatter=False, color='red', ax=ax)

ax.set_title('Career Kills vs Career Wins')
ax.set_xlabel('Career Kills')
ax.set_ylabel('Career Wins')

fig.tight_layout()
fig.savefig('scatter_regression.png')  # write the figure to disk before showing it
plt.show()


The scatter plot illustrates a strong positive relationship between Career Kills and Career Wins, with a regression line indicating that higher kill counts are associated with increased wins.

# 6.2. Residual Plot
Purpose: Assess the assumptions of linearity and homoscedasticity.

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm

# Refit the simple OLS model of career wins on career kills.
# NOTE: this rebinds X, y, model and residuals.
y = data['career_wins']
X = sm.add_constant(data['career_kills'])
model = sm.OLS(endog=y, exog=X).fit()
residuals = model.resid

# Residuals against the predictor, with a dashed zero line for reference.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x=data['career_kills'], y=residuals, alpha=0.5, ax=ax)
ax.axhline(0, color='red', linestyle='--')

ax.set_title('Residuals vs Career Kills')
ax.set_xlabel('Career Kills')
ax.set_ylabel('Residuals')

fig.tight_layout()
fig.savefig('residual_plot.png')  # write the figure to disk before showing it
plt.show()


Description for Report:
The residual plot shows the residuals centred on the zero line without any apparent curvature, suggesting that the linearity assumption is reasonable. However, the spread of the residuals increases with higher kill counts, indicating heteroscedasticity.

# 6.3. Q-Q Plot
Purpose: Assess the normality of residuals.

In [None]:
import matplotlib.pyplot as plt
import scipy.stats as stats
import statsmodels.api as sm

# Q-Q plot of the residuals with the 45-degree reference line; `fit=True`
# is passed so the distribution parameters are fitted to the sample.
qq_fig = sm.qqplot(residuals, line='45', fit=True)
qq_ax = qq_fig.gca()
qq_ax.set_title('Q-Q Plot of Residuals')

qq_fig.tight_layout()
qq_fig.savefig('qq_plot.png')  # write the figure to disk before showing it
plt.show()


Description for Report:
The Q-Q plot of residuals deviates significantly from the reference line, indicating that the residuals are not normally distributed.

# 6.4. Correlation Heatmap
Purpose: Visualize the correlations between variables.

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise correlation matrix of all three career metrics.
# `career_revives` is included so the heatmap also shows the kills/revives
# and wins/revives pairs that the accompanying description reports, not
# just kills/wins.
corr_matrix = data[['career_kills', 'career_wins', 'career_revives']].corr()

# Annotated heatmap, with each coefficient shown to four decimal places.
plt.figure(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".4f")
plt.title('Correlation Heatmap')
plt.tight_layout()
plt.savefig('correlation_heatmap.png')  # Save the plot as an image
plt.show()


Description for Report:
The heatmap displays strong positive correlations between Career Kills and Career Wins (0.7997), Career Wins and Career Revives (0.7029), and a moderate correlation between Career Kills and Career Revives (0.6111).

# 6.5. Actual vs. Predicted Career Wins Plot
Purpose: Visualize the accuracy of the regression model.

In [None]:
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns

# Multiple regression of career wins on career kills and career revives
# (intercept included via the added constant column).
y = data['career_wins']
X_multi = sm.add_constant(data[['career_kills', 'career_revives']])
model_multi = sm.OLS(endog=y, exog=X_multi).fit()

# In-sample predictions from the fitted model.
predictions = model_multi.predict(X_multi)

fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x=y, y=predictions, alpha=0.5, ax=ax)

# Dashed red identity line spanning the observed range of wins:
# points on it are predicted exactly.
wins_range = [y.min(), y.max()]
ax.plot(wins_range, wins_range, 'r--')

ax.set_title('Actual vs Predicted Career Wins')
ax.set_xlabel('Actual Career Wins')
ax.set_ylabel('Predicted Career Wins')

fig.tight_layout()
fig.savefig('actual_vs_predicted.png')  # write the figure to disk before showing it
plt.show()


Description for Report:
The scatter plot of Actual vs Predicted Career Wins shows a strong alignment along the diagonal line, indicating that the multiple regression model accurately predicts Career Wins based on Career Kills and Career Revives.

