### aiohttp to get player links

In [1]:
import time
import aiohttp
import asyncio
import pandas as pd
from bs4 import BeautifulSoup
from bloom_filter import BloomFilter
bloombloom = BloomFilter(max_elements=100000000, error_rate=0.1)
import lxml.html

In [2]:
base = "https://sofifa.com/players?offset="
urls = []
for i in range(0, 20040, 60):
    url = base + str(i)
    urls.append(url)

In [4]:
players = []
links = []
async def fetch(session, url):
    async with session.get(url) as response:
        return await response.text(encoding="utf-8")

In [5]:
async def parser(html):
    doc = lxml.html.fromstring(html)
    out = listing()
    for path in out:
        base = "https://sofifa.com"
        link = doc.xpath(path)[0]
        ver = link.split(sep="/")[4][0:2]
        if ver != "22":
            continue
        p_url = base + link
        if bloombloom.__contains__(p_url):
            #print(f"This url is duplicated: {p_url}")
            continue
        links.append(p_url)
        bloombloom.add(p_url)

In [6]:
async def download(url):
    async with aiohttp.ClientSession() as session:
        html = await fetch(session, url)
        await parser(html)

In [17]:
url2 = "https://sofifa.com/players?offset="
urls2 = []
for offset in range(0, 20060, 60):
    urls2.append(url2 + str(offset))


'https://sofifa.com/players?offset=15000'

In [20]:
import requests

In [28]:
t7 = time.time()
#tot_url = urls2[1]
#tot_url

for url in urls2[1:10]:
    requests.get(url)
print(f"Time taken: {time.time() - t7}")


Time taken: 4.227078676223755


In [10]:
len(links)

14286

In [7]:
import nest_asyncio
nest_asyncio.apply()

In [8]:
##
print("#" * 50)
t1 = time.time()
loop = asyncio.get_event_loop()
tasks = [asyncio.ensure_future(download(url)) for url in urls]
tasks = asyncio.gather(*tasks)
loop.run_until_complete(tasks)
    
t2 = time.time() # 结束时间
print('使用aiohttp，总共耗时：%s' % (t2 - t1))
print('#' * 50)


##################################################
使用aiohttp，总共耗时：49.58137083053589
##################################################


In [None]:
p

### Read data

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt
import numpy as np
import joblib
%matplotlib inline

In [2]:
df = pd.read_csv("fifa.csv")
print(f"The data size is: {df.shape}")
df.head()

The data size is: (10023, 51)


Unnamed: 0,name,first_name,last_name,country,age,overall,potential,club,best_position,value,...,vision,penalties,composure,defensive_awareness,standing_tackle,sliding_tackle,diving,handling,kicking,reflexes
0,Rúben Daniel Fonseca Macedo,Rúben,Macedo,Portugal,25,65,68,Clube Sport Marítimo,RW,950000,...,62,68,70,32,25,21,6,11,14,13
1,Naif Almas,Naif,Almas,Saudi Arabia,21,57,69,Al Fayha,CB,375000,...,35,32,37,57,57,54,12,15,10,12
2,Rakan Al Shamlan,Rakan,Shamlan,Saudi Arabia,22,59,68,Al Batin,LW,500000,...,51,46,57,38,43,45,13,6,7,14
3,Erick Wiemberg,Erick,Wiemberg,Chile,27,69,70,Unión La Calera,LB,1500000,...,57,50,65,58,66,64,12,13,7,9
4,Nicolás Forastiero,Nicolás,Forastiero,Argentina,22,59,69,Argentinos Juniors,GK,450000,...,30,11,31,6,10,12,56,64,64,61


In [60]:
all_columns = df.columns.to_list()
target = "total_stats"
drop_feats = all_columns[0:4] + [target]
# initital split of data
train_df, test_df = train_test_split(df, test_size=0.40, random_state=123)
X_train, y_train = train_df.drop(columns=drop_feats), train_df[target]
X_test, y_test = test_df.drop(columns=drop_feats), test_df[target]

In [61]:
# some encoding stuffs
categorical_feats = ["club", "best_position", "preferred_foot", "work_rate"]
ordinal_feats = ["weak_foot", "skill_move"]
numerical_feats = list(set(all_columns) - set(categorical_feats)
                       - set(ordinal_feats) - set(drop_feats))

# some transformations
numeric_transformer = make_pipeline(StandardScaler())
categorical_transformer = make_pipeline(OneHotEncoder(handle_unknown="ignore", sparse=False))
ct = make_column_transformer(
    (numeric_transformer, numerical_feats),
    (categorical_transformer, categorical_feats)
)
ct.fit(X_train)

In [62]:
# get transformed feature names
ohe_columns = list(
    ct.named_transformers_["pipeline-2"]
    .named_steps["onehotencoder"]
    .get_feature_names_out()
)
new_columns = numerical_feats + ohe_columns
train_df_transformed = pd.DataFrame(
    ct.transform(X_train), columns=new_columns
)
train_df_transformed.head()

Unnamed: 0,wage,weight,balance,standing_tackle,defensive_awareness,value,handling,short_passing,penalties,interceptions,...,preferred_foot_Right,work_rate_High/ High,work_rate_High/ Low,work_rate_High/ Medium,work_rate_Low/ High,work_rate_Low/ Low,work_rate_Low/ Medium,work_rate_Medium/ High,work_rate_Medium/ Low,work_rate_Medium/ Medium
0,-0.08847,-0.293756,0.773076,0.130359,0.060633,-0.15674,-0.530393,0.592894,1.209173,0.099936,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.343621,-0.293756,0.345961,-0.823868,-0.431783,-0.329289,-0.530393,-0.755698,0.256647,-1.056134,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,-0.254422,-1.419271,0.986634,0.941453,0.651532,-0.15674,-0.079077,0.862613,-1.013388,0.726141,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,-0.358142,1.816585,-0.721827,0.368916,0.454565,-0.326244,-0.336972,-1.295135,-1.076889,0.388954,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,-0.254422,1.394517,-1.148942,0.893741,1.045464,-0.24809,-0.014604,-0.48598,-0.37837,0.918819,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [63]:
pipe_ridge = make_pipeline(ct, Ridge())
scores = pd.DataFrame(cross_validate(pipe_ridge, X_train, y_train, return_train_score=True))
scores

Unnamed: 0,fit_time,score_time,test_score,train_score
0,0.100006,0.014,0.999687,0.999737
1,0.099001,0.015999,0.999673,0.99974
2,0.094998,0.013001,0.999626,0.999749
3,0.096,0.017,0.999619,0.999749
4,0.101,0.015006,0.999665,0.999741


In [None]:
# examine coefficients learned by the model
pipe_ridge.fit(X_train, y_train)
data = {
    "coefficient": pipe_ridge.named_steps["ridge"].coef_.tolist(),
    "magnitude": np.absolute(pipe_ridge.named_steps["ridge"].coef_.tolist()),
}
coef_df = pd.DataFrame(data, index=train_df_transformed.columns).sort_values(
    "magnitude", ascending=False
)
coef_df

Unnamed: 0,coefficient,magnitude
finishing,25.242782,25.242782
dribbling,22.096167,22.096167
standing_tackle,20.724095,20.724095
interceptions,20.717783,20.717783
long_shots,20.424262,20.424262
...,...,...
club_Harrogate Town,-0.009848,0.009848
club_Sheffield United,0.008594,0.008594
club_Cardiff City,-0.008298,0.008298
club_Metropolitanos de Caracas FC,0.004340,0.004340


In [130]:
model = Ridge(random_state=123).fit(X_train.drop(columns=["club", "best_position", 
                                                          "preferred_foot", "work_rate"]).values,
                                    y_train.values)

predicted_total_stats = model.predict(X_test.drop(columns=["club", "best_position", 
                                                          "preferred_foot", "work_rate"]).values)

mae = mean_absolute_error(predicted_total_stats, y_test)
print(f"MAE = {mae:.2f} points of total stats")

MAE = 4.14 points of total stats


In [131]:
## save model
with open("web_api/total_stats_predictor.joblib", "wb") as f:
    joblib.dump(model, f)
with open('web_application/total_stats_predictor.joblib', 'wb') as f:
    joblib.dump(model, f)

#### prediction function

In [141]:
features = list(set(X_train.columns.to_list()) - set(["club", "best_position", 
                                                     "preferred_foot", "work_rate"]))
inp_dict = {}
for feat in features:
    if feat == "value" or feat == "wage":
        pair = (feat, np.random.randint(1e6, 1e7))
        inp_dict[pair[0]] = pair[1]
    else:
        pair = (feat, np.random.randint(70, 99))
        inp_dict[pair[0]] = pair[1]

In [142]:
def return_prediction(model, input_json):
    input_data = [[input_json[k] for k in input_json.keys()]]
    prediction = model.predict(input_data)[0]

    return prediction

In [143]:
return_prediction(model, inp_dict)

-1510679.0417916523

### New section