In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy as sp
from scipy.stats import chi2
from math import floor
from sklearn.model_selection import train_test_split, KFold, cross_val_score, cross_validate
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from sklearn import metrics
%matplotlib inline
# Global plotting theme: gray axes/ticks/text on seaborn's "white" style,
# a 20x10 default figure, and enlarged label/title/legend fonts.
_THEME_RC = {
    'axes.edgecolor': 'gray',
    'axes.labelcolor': 'gray',
    'xtick.color': 'gray',
    'ytick.color': 'gray',
    'text.color': 'gray',
    'figure.figsize': (20, 10),
    'legend.fontsize': 12,
    'font.size': 12,
    'legend.title_fontsize': 14,
    'axes.labelsize': 14,
    'axes.titlesize': 24,
}
sns.set(style='white', rc=_THEME_RC)

# Hex colour for each of the 18 type names. The key order matters: later cells
# build `type_list` from these keys and pair colours with types by position.
color_dict = dict(
    normal="#A8AA79",
    fire="#EF812E",
    water="#6991F0",
    grass="#7AC852",
    electric="#F6D030",
    ice="#9AD7D9",
    fighting="#C12F27",
    poison="#A0429F",
    ground="#BCA23B",
    flying="#A991F0",
    psychic="#F85887",
    bug="#A7B822",
    rock="#B99F38",
    ghost="#6D5947",
    dark="#70589A",
    dragon="#6B3EE3",
    steel="#B6B8D0",
    fairy="#FF65D5",
)

In [None]:
# Read the raw table, then build `df` without the "against_*" columns and with
# the misspelled "classfication" header corrected.
fulldata = pd.read_csv('DataSets/pokemon_comp.csv')
col_against = fulldata.loc[:, fulldata.columns.str.contains('against_')]
df = (
    fulldata
    .drop(columns=col_against.columns)
    .rename(columns={'classfication': 'classification'})
)
df.shape

## 1. Is a Pokémon Legendary?

In [None]:
# Print the name of every row flagged with is_legendary == 1, one per line.
legendary_names = df.loc[df['is_legendary'] == 1, 'name']
for legendary_name in legendary_names:
    print(legendary_name)

In [None]:
# Reduce each capture rate to its first run of digits and store it as a float.
# - The pattern is a raw string: '\d' in a plain literal is an invalid escape
#   sequence that newer Python versions warn about.
# - expand=False yields a Series rather than a one-column DataFrame.
# - Casting to str first keeps the cell re-runnable: once the column is already
#   float, `.str` on it would raise.
capture_rate_digits = (
    df['capture_rate']
    .astype(str)
    .str.extract(r'(\d+)', expand=False)
)
df['capture_rate'] = capture_rate_digits.astype(float)
print(df.columns)


In [None]:
# Columns correlated against the legendary flag. 'base_total' is listed once
# only; a repeated label would add a duplicate row and column to the heatmap.
legend_columns = [
    'is_legendary', 'hp', 'attack', 'defense', 'sp_attack', 'sp_defense',
    'speed', 'base_total', 'experience_growth', 'capture_rate',
    'base_happiness', 'base_egg_steps', 'percentage_male',
]
legend = df[legend_columns]

# Create a figure and plot the annotated correlation heatmap.
plt.figure(figsize=(12, 10))
cor = legend.corr()
sns.heatmap(cor, annot=True, cmap=plt.cm.CMRmap_r)
plt.show()

In [None]:
# Drop the listed columns, then fill remaining gaps with per-column means.
excluded_columns = [
    'abilities', 'classification', 'japanese_name', 'name',
    'type1', 'type2', 'percentage_male', 'generation',
]
digit_data = df.drop(columns=excluded_columns)
column_means = digit_data.mean()
digit_data = digit_data.fillna(column_means)

## 2. Relationship Between "Shape" (Weight and Height) and Base Stats

In [None]:
# NOTE: `df` is rebound here to a fresh copy of selected `fulldata` columns;
# the frame built in section 1 is no longer reachable under this name.
shape_columns = [
    "name", "type1", "type2", "weight_kg", "height_m",
    "hp", "attack", "defense", "sp_attack", "sp_defense", "speed",
    "base_total", "is_legendary",
]
df = fulldata.loc[:, shape_columns].copy()
df.info()

In [None]:
# Rows lacking a weight or a height; only the weight table is displayed.
weight_is_missing = df["weight_kg"].isna()
height_is_missing = df["height_m"].isna()
missing_weight = df[weight_is_missing]
missing_height = df[height_is_missing]
missing_weight

In [None]:
def dfix(index_no, type2, weight, height, frame=None):
    """Overwrite the secondary type, weight and height of one row.

    Parameters
    ----------
    index_no : index label of the row to patch (used with ``.loc``).
    type2 : value stored in the "type2" column (``np.nan`` for no second type).
    weight : value stored in the "weight_kg" column.
    height : value stored in the "height_m" column.
    frame : DataFrame to patch in place. Defaults to the notebook-level ``df``,
        which is what every existing four-argument call relies on.

    Returns
    -------
    int
        Always 0 (kept for compatibility with existing callers).
    """
    # Resolve the global lazily so an explicit `frame` works without `df`.
    target = df if frame is None else frame
    target.loc[index_no, "type2"] = type2
    target.loc[index_no, "weight_kg"] = weight
    target.loc[index_no, "height_m"] = height
    return 0

# Manual corrections, one tuple per row: (index label, type2, weight_kg, height_m).
# Each tuple is forwarded unchanged to dfix, in this order.
# NOTE(review): presumably these are the rows listed in `missing_weight` — confirm.
manual_fixes = [
    (18, np.nan, 3.5, 0.3),
    (19, np.nan, 18.5, 0.7),
    (25, np.nan, 30, 0.8),
    (26, np.nan, 12, 0.6),
    (27, np.nan, 29.5, 1),
    (36, np.nan, 9.9, 0.6),
    (37, np.nan, 19.9, 1.1),
    (49, np.nan, 0.8, 0.2),
    (50, np.nan, 33.3, 0.7),
    (51, np.nan, 4.2, 0.4),
    (52, np.nan, 32, 1),
    (73, "ground", 20, 0.4),
    (74, "ground", 105, 1),
    (75, "ground", 300, 1.4),
    (87, np.nan, 30, 0.9),
    (88, np.nan, 30, 1.2),
    (102, "psychic", 120, 2),
    (104, np.nan, 45, 1),
    (491, np.nan, 2.1, 0.2),
    (554, np.nan, 92.9, 1.3),
    (719, "dark", 490, 6.5),
    (744, np.nan, 25, 0.8),
]
for fix in manual_fixes:
    dfix(*fix)

# Show the previously incomplete rows again to verify the patch.
df[df["name"].isin(missing_weight["name"])]

In [None]:
# One indicator column per type: 1 when the type appears as type1 or type2.
# Computed column-wise, so it does not depend on the frame having a 0..n-1
# index, and a missing type2 (NaN) simply compares unequal. The previous
# `is not np.nan` identity test only worked when the stored missing value was
# the very same object as np.nan.
type_list = pd.Series(list(color_dict.keys()))
for type_name in type_list:
    has_type = df["type1"].eq(type_name) | df["type2"].eq(type_name)
    df[type_name] = has_type.astype("int64")

g = sns.scatterplot(x="weight_kg", y="hp", data=df, hue="type1", legend="full", palette=color_dict)
g.set_title("Pokemon by Weight and Base HP")

In [None]:
# Pearson correlation between weight and base HP over all rows, to 3 decimals.
# The message now names the variable actually used (weight, not height).
a = round(df["weight_kg"].corr(df["hp"]), 3)
print("The correlation coefficient between weight and base HP for all pokemon is " + str(a))

### 加权

In [None]:
# Weight/HP correlation among the Pokémon carrying each type (flag == 1).
# Values are collected first and the frame is built once, so "corr_coef" is a
# proper float column instead of the result of growing an empty object-dtype
# frame one row at a time.
corr_by_type = {}
for type_name in type_list:
    pairwise = df.groupby(type_name)[["weight_kg", "hp"]].corr()
    # Row (1, "hp") is the hp row of the correlation matrix for flag == 1.
    corr_by_type[type_name] = pairwise.loc[(1, "hp"), "weight_kg"]

df_type = pd.DataFrame({"corr_coef": pd.Series(corr_by_type, dtype="float64")})
df_type.index.name = "type"
round(df_type.sort_values(by="corr_coef", ascending=False), 3)

In [None]:
# Number of Pokémon carrying each type = sum of that type's 0/1 indicator
# column. Assignment aligns on the type-name index of df_type.
indicator_totals = df[type_list.tolist()].sum()
df_type["type_count"] = indicator_totals.astype(float)

df_type["type_count"].sort_values(ascending=False)

In [None]:
# Average of the per-type correlation coefficients, weighted by type_count.
weighted_total = sum(
    df_type.loc[type_name, "corr_coef"] * df_type.loc[type_name, "type_count"]
    for type_name in type_list
)
total_weight = sum(df_type.loc[:, "type_count"])

b = round(weighted_total / total_weight, 3)
print("The weighted average correlation coefficient of all types of pokemon using weights equal to the number of pokemon with that type is "+ str(b))
print("This exceeds the unweighted average by " + str(round((b-a),3)))

In [None]:
# filtered_data[k] holds the rows carrying the k-th type of type_list; the
# per-type weight mean and standard deviation are recorded in df_type.
filtered_data = [df[df[type_name] == 1] for type_name in type_list]
for type_name, members in zip(type_list, filtered_data):
    member_weights = members["weight_kg"]
    df_type.loc[type_name, "weight_mean"] = member_weights.mean()
    df_type.loc[type_name, "weight_stdev"] = member_weights.std()

weight_summary = df_type[["weight_mean", "weight_stdev"]]
round(weight_summary.sort_values(by="weight_mean", ascending=False), 2)

In [None]:
# Per-type base-HP mean and standard deviation, from the filtered_data subsets.
for type_name, members in zip(type_list, filtered_data):
    member_hp = members["hp"]
    df_type.loc[type_name, "hp_mean"] = member_hp.mean()
    df_type.loc[type_name, "hp_stdev"] = member_hp.std()

hp_summary = df_type[["hp_mean", "hp_stdev"]]
round(hp_summary.sort_values(by="hp_mean", ascending=False), 2)

In [None]:
# Bar chart of mean weight per type, heaviest first, in the type colours.
weight_ranked = df_type.sort_values(by="weight_mean", ascending=False)
g = sns.barplot(
    x=weight_ranked.index,
    y="weight_mean",
    data=weight_ranked,
    palette=color_dict,
)
g.set_title("Average Pokemon Weight by Type")

In [None]:
# Weight distribution per primary type; outlier markers are not drawn.
g = sns.boxplot(
    x="type1",
    y="weight_kg",
    data=df,
    showfliers=False,
    palette=color_dict,
)
g.set_title("Pokemon Weights by Primary Type")

In [None]:
# One point per type: correlation coefficient against member count, with a
# fitted regression line. Marker area scales with type_count; colours follow
# the order of color_dict.
point_style = {"s": 10*df_type["type_count"], "color": list(color_dict.values())}
trend_style = {"linewidth": 8, "color": "purple"}
g = sns.lmplot(
    x="corr_coef",
    y="type_count",
    data=df_type,
    legend=False,
    height=10,
    aspect=2,
    scatter_kws=point_style,
    line_kws=trend_style,
)
g.fig.suptitle("Correlation Coefficients vs. Number of Pokemon of that Type", fontsize=40, y=1.05)

In [None]:
# 3x6 grid of base-HP histograms, one panel per type, filled row by row.
fig, g = plt.subplots(3, 6)
type_colors = list(color_dict.items())
for i, panel in enumerate(g.flat[:len(type_list)]):
    type_name, type_color = type_colors[i]
    sns.histplot(
        data=filtered_data[i],
        x="hp",
        ax=panel,
        color=type_color,
        binrange=[0,255],
        bins=13,
    ).set(title=type_name)
fig.tight_layout()
fig.suptitle("Distribution of Base HP by Type", fontsize=50, y = 1.1)

### 去除异常值

In [None]:
# How many of the 30 heaviest Pokémon carry each type.
# The sort is done once rather than once per type, and the counts come from
# summing the indicator columns, giving an integer column instead of an
# object-dtype frame filled cell by cell.
n_heaviest = 30
heaviest = df.sort_values(by="weight_kg", ascending=False).head(n_heaviest)
uni_outliers_by_type = heaviest[type_list.tolist()].sum().to_frame("outlier_count")

uni_outliers_by_type.sort_values(by="outlier_count", ascending=False)

In [None]:
def maha(x=None, data=None, cov=None):
    """Return the squared Mahalanobis distance of each row of ``x`` from ``data``.

    Parameters
    ----------
    x : DataFrame of observations, one row per point, with the same columns
        as ``data``.
    data : DataFrame whose column means define the centre of the distribution.
    cov : optional covariance matrix. When None (the default) it is estimated
        from ``data`` with ``np.cov``. Previously this argument was accepted
        but silently ignored.

    Returns
    -------
    numpy.ndarray
        One squared distance per row of ``x``.

    Raises
    ------
    numpy.linalg.LinAlgError
        If the covariance matrix is singular.
    """
    centered = np.asarray(x - data.mean(), dtype=float)
    if cov is None:
        cov = np.cov(data.values.T)
    inv_covmat = np.linalg.inv(np.atleast_2d(cov))
    # Row-wise quadratic form d_i^T S^-1 d_i. This equals the diagonal of
    # D S^-1 D^T without materialising the full n x n product.
    return np.einsum("ij,jk,ik->i", centered, inv_covmat, centered)

In [None]:
# For every type subset, attach each Pokémon's squared Mahalanobis distance in
# (weight, HP) space and the matching chi-square tail probability.
# - The squared distance over p variables is referred to a chi-square
#   distribution with p degrees of freedom, so df is the number of feature
#   columns (2), not 1.
# - Columns are assigned on a fresh copy, avoiding SettingWithCopyWarning.
# - Any earlier "mahala"/"p_value" columns are dropped first, so re-running the
#   cell does not produce suffixed duplicates.
feature_cols = ["weight_kg", "hp"]
for i in range(len(type_list)):
    subset = filtered_data[i].drop(columns=["mahala", "p_value"], errors="ignore").copy()
    features = subset[feature_cols]
    subset["mahala"] = maha(x=features, data=features)
    # sf(x) is 1 - cdf(x), computed without cancellation for small tails.
    subset["p_value"] = chi2.sf(subset["mahala"], len(feature_cols))
    filtered_data[i] = subset

In [None]:
# Stack the rows of every type subset whose p-value is below 0.001.
# One concat over all subsets keeps the numeric dtypes; starting from an empty
# object-dtype frame and concatenating in a loop degrades them and triggers a
# pandas FutureWarning about empty entries.
outliers_per_type = [members[members["p_value"] < .001] for members in filtered_data]
bivar_outliers = pd.concat(outliers_per_type)


In [None]:
# Keep an outlier when it has no secondary type, or when its name occurs more
# than once in the stacked table (second and later occurrences are kept).
# NOTE: this rebinds bivar_outliers, so re-running the cell alone gives a
# different result; re-run the preceding cell first.
is_repeat = bivar_outliers.duplicated(subset=["name"], keep="first")
is_single_type = bivar_outliers["type2"].isna()
outlier_dupes = bivar_outliers[is_repeat]
bivar_outliers = pd.concat([bivar_outliers[is_single_type], outlier_dupes])
bivar_outliers.sort_values(by="name")

In [None]:
# Split off the rows whose name is among the bivariate outliers, then draw the
# remaining rows in dark blue and the outliers in red on the same axes.
is_outlier = df["name"].isin(bivar_outliers["name"])
df_no_out = df[~is_outlier]
inlier_ax = sns.scatterplot(x="weight_kg", y="hp", data=df_no_out, color="#152558")
inlier_ax.set_title("Pokemon by Weight and Base HP, Outliers Highlighted in Red")
sns.scatterplot(x="weight_kg", y="hp", data=bivar_outliers, color="#F82517")

### 再计算

In [None]:
# Per-type weight/HP correlation recomputed on the outlier-free rows.
# Values are collected first and the frame is built once, so "corr_coef" is a
# proper float column instead of the result of growing an empty object-dtype
# frame one row at a time.
corr_by_type_no_out = {}
for type_name in type_list:
    pairwise = df_no_out.groupby(type_name)[["weight_kg", "hp"]].corr()
    # Row (1, "hp") is the hp row of the correlation matrix for flag == 1.
    corr_by_type_no_out[type_name] = pairwise.loc[(1, "hp"), "weight_kg"]

df_type_no_out = pd.DataFrame({"corr_coef": pd.Series(corr_by_type_no_out, dtype="float64")})
df_type_no_out.index.name = "type"

In [None]:
# Per-type member count and weight/HP mean and standard deviation on the
# outlier-free rows; the per-type subsets are kept in filtered_data_no_out.
filtered_data_no_out = []
for type_name in type_list:
    members = df_no_out[df_no_out[type_name] == 1]
    filtered_data_no_out.append(members)
    type_stats = {
        "type_count": sum(df_no_out.loc[:, type_name]),
        "weight_mean": members["weight_kg"].mean(),
        "weight_stdev": members["weight_kg"].std(),
        "hp_mean": members["hp"].mean(),
        "hp_stdev": members["hp"].std(),
    }
    for column, value in type_stats.items():
        df_type_no_out.loc[type_name, column] = value

In [None]:
# Join the with-outlier and outlier-free summaries on the type index. Columns
# present in both get "_old" (with outliers) and "_new" (outliers removed).
d = df_type.merge(
    df_type_no_out,
    left_index=True,
    right_index=True,
    suffixes=("_old", "_new"),
)

In [None]:
# Absolute and percentage change of the correlation coefficient per type.
d["corr_coef_Δ"] = d["corr_coef_new"].sub(d["corr_coef_old"])
d["corr_coef_%_Δ"] = d["corr_coef_Δ"].div(d["corr_coef_old"]).mul(100).round(1)

corr_columns = ["corr_coef_new", "corr_coef_old", "corr_coef_Δ", "corr_coef_%_Δ"]
d[corr_columns].sort_values(by="corr_coef_new", ascending=False).round(3)

In [None]:
# Absolute and percentage change of the member count per type.
d["type_count_Δ"] = d["type_count_new"].sub(d["type_count_old"])
d["type_count_%_Δ"] = d["type_count_Δ"].div(d["type_count_old"]).mul(100).round(1)

count_columns = ["type_count_new", "type_count_old", "type_count_Δ", "type_count_%_Δ"]
d[count_columns].sort_values(by="type_count_new", ascending=False)

In [None]:
# Absolute and percentage change of the weight mean and standard deviation.
for stat in ("weight_mean", "weight_stdev"):
    old_values = d[stat + "_old"]
    d[stat + "_Δ"] = d[stat + "_new"].sub(old_values)
    d[stat + "_%_Δ"] = d[stat + "_Δ"].div(old_values).mul(100).round(1)

weight_columns = [
    "weight_mean_new", "weight_mean_old", "weight_mean_Δ", "weight_mean_%_Δ",
    "weight_stdev_new", "weight_stdev_old", "weight_stdev_Δ", "weight_stdev_%_Δ",
]
d[weight_columns].sort_values(by="weight_mean_new", ascending=False).round(2)

In [None]:
# One point per type (outliers removed): correlation coefficient against
# member count, with a fitted regression line. Marker area scales with
# type_count; colours follow the order of color_dict.
point_style = {"s": 10*df_type_no_out["type_count"], "color": list(color_dict.values())}
trend_style = {"linewidth": 8, "color": "purple"}
g = sns.lmplot(
    x="corr_coef",
    y="type_count",
    data=df_type_no_out,
    legend=False,
    height=10,
    aspect=2,
    scatter_kws=point_style,
    line_kws=trend_style,
)
g.fig.suptitle("Correlation Coefficients vs. Number of Pokemon of that Type (Outliers Removed)", fontsize=40, y=1.05)

In [None]:
# One point per type (outliers removed): correlation coefficient against mean
# weight, with a fitted regression line. Marker area scales with type_count;
# colours follow the order of color_dict.
point_style = {"s": 10*df_type_no_out["type_count"], "color": list(color_dict.values())}
trend_style = {"linewidth": 8, "color": "purple"}
g = sns.lmplot(
    x="corr_coef",
    y="weight_mean",
    data=df_type_no_out,
    legend=False,
    height=10,
    aspect=2,
    scatter_kws=point_style,
    line_kws=trend_style,
)
g.fig.suptitle("Correlation Coefficients vs. Average Weight of Pokemon of that Type (Outliers Removed)", fontsize=40, y=1.05)

In [None]:
# 3x6 grid of base-HP histograms on the outlier-free subsets, one panel per
# type, filled row by row.
fig, g = plt.subplots(3, 6)
type_colors = list(color_dict.items())
for i, panel in enumerate(g.flat[:len(type_list)]):
    type_name, type_color = type_colors[i]
    sns.histplot(
        data=filtered_data_no_out[i],
        x="hp",
        ax=panel,
        color=type_color,
        binrange=[0,255],
        bins=13,
    ).set(title=type_name)
fig.tight_layout()
fig.suptitle("Distribution of Base HP by Type (Outliers Removed)", fontsize=50, y = 1.1)

In [None]:
# Absolute and percentage change of the base-HP mean and standard deviation.
for stat in ("hp_mean", "hp_stdev"):
    old_values = d[stat + "_old"]
    d[stat + "_Δ"] = d[stat + "_new"].sub(old_values)
    d[stat + "_%_Δ"] = d[stat + "_Δ"].div(old_values).mul(100).round(1)

hp_columns = [
    "hp_mean_new", "hp_mean_old", "hp_mean_Δ", "hp_mean_%_Δ",
    "hp_stdev_new", "hp_stdev_old", "hp_stdev_Δ", "hp_stdev_%_Δ",
]
d[hp_columns].sort_values(by="hp_mean_new", ascending=False).round(2)

### 线性回归

In [None]:
# Linear regression of base HP on weight plus the 18 type indicators, fitted
# with 10-fold shuffled cross-validation on the outlier-free rows.
# - Feature columns are derived from type_list, so the coefficient order is
#   guaranteed to be [weight_kg, <types in type_list order>].
# - cross_validate already returns the per-fold R² scores, so the models are
#   fitted once instead of once more through cross_val_score.
feature_cols = ["weight_kg", *type_list]
X = df_no_out[feature_cols]
y = df_no_out["hp"]
kf = KFold(n_splits=10, shuffle=True, random_state=135)
cv_results = cross_validate(LinearRegression(), X=X, y=y, cv=kf, scoring="r2", return_estimator=True)
cv_scores = cv_results["test_score"]

# Per-fold fitted parameters, in fold order.
cv_coefs = [model.coef_ for model in cv_results["estimator"]]
cv_intercepts = [model.intercept_ for model in cv_results["estimator"]]

In [None]:
# Predicted HP for every row of df = mean of the per-fold model predictions.
# For linear models that mean equals one prediction made with the
# fold-averaged coefficients and intercept, so it is computed in a single
# matrix product. The previous nested loop reset its list inside the fold loop
# and was correct only because the accumulators were never reset; it also
# hard-coded 10 folds and required a 0..n-1 index.
# Coefficient order is [weight_kg, <types in type_list order>].
feature_cols = ["weight_kg", *type_list]
mean_coefs = np.mean(np.vstack(cv_coefs), axis=0)
mean_intercept = float(np.mean(cv_intercepts))
feature_matrix = df[feature_cols].to_numpy(dtype=float)
df["predicted_hp"] = feature_matrix @ mean_coefs + mean_intercept

In [None]:
# Matplotlib renamed its bundled seaborn styles to "seaborn-v0_8-*" in 3.6 and
# removed the old names in 3.8. Use whichever name this installation provides.
whitegrid_style = next(
    (name for name in ("seaborn-v0_8-whitegrid", "seaborn-whitegrid") if name in plt.style.available),
    "seaborn-whitegrid",
)
plt.style.use(whitegrid_style)

# Predicted vs. actual HP, with the dashed y = x line as the perfect-fit reference.
plt.scatter(df["predicted_hp"], df["hp"], color="blue")
plt.xlabel("predicted HP")
plt.ylabel("actual HP")
plt.plot([0,255],[0,255], color="red",linestyle="dashed")
plt.title("Predicted vs. Actual HP Values")

In [None]:
# Histogram of the residuals (actual HP minus predicted HP) in 60 bins.
residuals = df["hp"] - df["predicted_hp"]
g = sns.histplot(residuals, bins=60, color="blue")
g.set_title("Residuals")

In [None]:
# Matplotlib renamed its bundled seaborn styles to "seaborn-v0_8-*" in 3.6 and
# removed the old names in 3.8. Use whichever name this installation provides.
whitegrid_style = next(
    (name for name in ("seaborn-v0_8-whitegrid", "seaborn-whitegrid") if name in plt.style.available),
    "seaborn-whitegrid",
)
plt.style.use(whitegrid_style)

# Residuals (actual minus predicted HP) against the predicted value.
plt.scatter(df["predicted_hp"], (df["hp"]-df["predicted_hp"]), color="blue")
plt.xlabel("predicted HP")
plt.ylabel("residuals")
plt.title("Predicted HP vs. Residuals")

In [None]:
# Error metrics of predicted vs. actual HP over all rows of df, to 3 decimals.
actual_hp, predicted_hp = df["hp"], df["predicted_hp"]
mse = metrics.mean_squared_error(actual_hp, predicted_hp)
mae = metrics.mean_absolute_error(actual_hp, predicted_hp)
r2 = metrics.r2_score(actual_hp, predicted_hp)

print("Mean Absolute Error:", round(mae, 3))
print("Mean Squared Error:", round(mse, 3))
print("Root Mean Squared Error:", round(np.sqrt(mse), 3))
print("R2 Score:", round(r2, 3))