In [None]:
### import libraries
import os 
import pandas as pd
from tqdm import tqdm
import numpy as np

import pyspark
from pyspark.sql import SparkSession
from pyspark import SparkContext, SQLContext
from pyspark.sql.functions import lit, monotonically_increasing_id

#### Helper functions

In [None]:
# For pandas
# Reducing dataframe memory usage :-
def ReduceMemory(df: pd.DataFrame):
    """
    Downcast the numeric columns of ``df`` in place and print ``df.info()``.

    Each float column is narrowed to float16 or float32 and each integer
    column to int8, int16 or int32 when the column's observed min and max fit
    in the narrower type. Only the value range is checked, so narrowing a
    float column can reduce its precision.

    The first column is never touched: candidate columns are taken from
    ``df.iloc[:, 1:]``.

    Parameters
    ----------
    df : pd.DataFrame
        Dataframe to shrink. It is modified in place.

    Returns
    -------
    pd.DataFrame
        The same dataframe object that was passed in.
    """

    # Reducing float column memory usage:-
    for col in tqdm(df.iloc[0:2, 1:].select_dtypes('float').columns):
        # Series.min/max skip NaN; for an all-NaN column they return NaN, both
        # range checks below are False and the column is left unchanged.
        col_min = df[col].min()
        col_max = df[col].max()

        if col_min >= np.finfo(np.float16).min and col_max <= np.finfo(np.float16).max:
            df[col] = df[col].astype(np.float16)
        elif col_min >= np.finfo(np.float32).min and col_max <= np.finfo(np.float32).max:
            df[col] = df[col].astype(np.float32)

    # Reducing integer column memory usage:-
    for col in tqdm(df.iloc[0:2, 1:].select_dtypes('int').columns):
        col_min = df[col].min()
        col_max = df[col].max()

        if col_min >= np.iinfo(np.int8).min and col_max <= np.iinfo(np.int8).max:
            df[col] = df[col].astype(np.int8)
        elif col_min >= np.iinfo(np.int16).min and col_max <= np.iinfo(np.int16).max:
            df[col] = df[col].astype(np.int16)
        # `and`, not `&`: the bitwise operator binds tighter than the
        # comparisons, which turned this test into
        # `col_min >= (int32.min & col_max) <= int32.max` and rejected every
        # column with a negative minimum.
        elif col_min >= np.iinfo(np.int32).min and col_max <= np.iinfo(np.int32).max:
            df[col] = df[col].astype(np.int32)

    # DataFrame.info() prints its report itself and returns None, so it is
    # called directly instead of being wrapped in display().
    df.info()

    return df

## Read Data

In [None]:
appName = "Project1"
# NOTE(review): `master` is defined but never passed to the builder, so Spark
# falls back to its default master. Kept so existing references still resolve.
master = "local"


# `sc` holds the SparkSession (not a SparkContext); the name is kept unchanged
# for backward compatibility with other cells.
sc = SparkSession.builder.appName(appName).getOrCreate()
# SQLContext takes a SparkContext first and, optionally, the session to wrap.
# The session's own context and the session itself are passed explicitly
# rather than handing the SparkSession in as if it were a SparkContext.
sqlContext = SQLContext(sc.sparkContext, sc)
spark = sqlContext.sparkSession

In [None]:
# fifa data folder should contain all the csv files from Fifa(Kaggle), 2015-2022
# assume that you are working in the same directory as the data folder
full_data_path = os.path.join(os.getcwd(), 'full_data.csv')

if not os.path.exists(full_data_path):
    data_path = os.path.join(os.getcwd(), 'fifadata')
    if os.path.exists(data_path):
        print("Data folder exists")
    else:
        print("Data folder does not exist")
        os.makedirs(data_path)
        print("Sussessfully created data folder")

    # Only "...players_<year>.csv" files can be tagged with a year below.
    # Sorting makes the union order (and therefore the generated ids)
    # independent of the order in which the OS lists the directory.
    csv_files = sorted(
        os.path.join(data_path, f)
        for f in os.listdir(data_path)
        if f.endswith('.csv') and 'players_' in f
    )
    print(csv_files)
    if not csv_files:
        raise FileNotFoundError(
            "No 'players_<year>.csv' files found in {}; copy the Kaggle FIFA CSV files there first.".format(data_path)
        )

    combined_df = None
    for file in csv_files:
        # Parse the year from the file name only, so a directory that happens
        # to contain "players_" cannot confuse the split.
        year = os.path.basename(file).split("players_")[1].split(".csv")[0]
        df = spark.read.csv(file, header=True, inferSchema=True)
        # Unique column 'year'. Cast to int so the column has the same type as
        # when the cached CSV is re-read with inferSchema; an unparsable value
        # becomes null instead of raising.
        df = df.withColumn("year", lit(year).cast("int"))
        if combined_df is None:
            combined_df = df
        else:
            # Match columns by name rather than by position.
            combined_df = combined_df.unionByName(df)
    combined_df = combined_df.withColumn("id", monotonically_increasing_id())

    # Write the concatenated DataFrame to the same path that is checked above,
    # so the cache is found on the next run. index=False keeps the pandas row
    # index out of the file (it would otherwise come back as an extra column).
    ReduceMemory(combined_df.toPandas()).to_csv(full_data_path, index=False)
    # Downstream cells use `df`; it must be the combined data, not the last
    # single-year file read in the loop.
    df = combined_df
else:
    df = spark.read.csv(full_data_path, header=True, inferSchema=True)
    

[Stage 16:> (5 + 25) / 30][Stage 17:>   (0 + 1) / 1][Stage 18:>   (0 + 1) / 1]

## Using pyspark to read table and write to PostgreSQL

In [None]:
# read multiple csv files into spark dataframe


In [None]:
# Connection settings for the local PostgreSQL instance used by the JDBC cells.
db_properties = {
    'username': "postgres",
    # The password is read from the POSTGRES_PASSWORD environment variable so
    # that no credential is stored in the notebook.
    'password': os.environ.get("POSTGRES_PASSWORD", ""),
    'url': "jdbc:postgresql://localhost:5432/postgres",
    'table': "fifa",
    'driver': "org.postgresql.Driver",
}

if not db_properties['password']:
    print("POSTGRES_PASSWORD is not set; the JDBC connection will be attempted with an empty password.")

In [None]:
# Write `df` to the PostgreSQL table named in db_properties over JDBC,
# replacing the table if it already exists.
jdbc_write_options = {
    "url": db_properties['url'],
    "dbtable": db_properties['table'],
    "user": db_properties['username'],
    "password": db_properties['password'],
    "Driver": db_properties['driver'],
}
df.write.format("jdbc").mode("overwrite").options(**jdbc_write_options).save()

In [None]:
# Read the table back from PostgreSQL over JDBC into `df_read` and preview it.
jdbc_read_options = {
    "url": db_properties['url'],
    "dbtable": db_properties['table'],
    "user": db_properties['username'],
    "password": db_properties['password'],
    "Driver": db_properties['driver'],
}
df_read = sqlContext.read.format("jdbc").options(**jdbc_read_options).load()

df_read.show()

In [None]:
df_read.printSchema()

## Task II 

### Question 1


In [None]:
# Register df_read as the temporary view "df_view" so the spark.sql queries
# in the following cells can reference it by name.
df_read.createOrReplaceTempView("df_view")

In [None]:
# Question 1: among the rows with year = 22, count players per club whose
# contract is valid until 2023 and keep the 10 clubs with the highest count.
sqlWay1 = spark.sql("""
SELECT dv.club_name, COUNT(*) AS player_count
FROM   df_view dv
WHERE  dv.year = 22 AND dv.club_contract_valid_until = 2023
GROUP BY dv.club_name
ORDER BY player_count DESC
LIMIT 10;
                    """)

In [None]:
sqlWay1.show()

### Question 2 


In [None]:
# Question 2: clubs with the highest average number of players older than 27
# per year. `club_counts` counts such players per (club, year); `club_averages`
# averages those counts per club over the years in which the club has at least
# one such player.
#
# A club is kept when fewer than 10 clubs have a strictly greater average,
# which is exactly RANK() <= 10 (ties at the cut-off are all kept). The window
# function replaces a correlated scalar subquery with a non-equality predicate:
# that form compares every club with every other club, and some Spark versions
# reject it.
# Because the window has no PARTITION BY, Spark ranks the per-club rows in a
# single partition.
sqlWay2 = spark.sql("""
WITH club_counts AS (
    SELECT dv.club_name, dv.year, COUNT(*) AS player_count
    FROM df_view dv
    WHERE dv.age > 27
    GROUP BY dv.club_name, dv.year
),
club_averages AS (
    SELECT cc.club_name, AVG(cc.player_count) AS average_count
    FROM club_counts cc
    GROUP BY cc.club_name
),
ranked_clubs AS (
    SELECT ca.club_name,
           ca.average_count,
           RANK() OVER (ORDER BY ca.average_count DESC) AS club_rank
    FROM club_averages ca
)
SELECT rc.club_name, rc.average_count
FROM ranked_clubs rc
WHERE rc.club_rank <= 10
ORDER BY rc.average_count DESC
""")

In [None]:
sqlWay2.show()

### Question 3

In [None]:
# Question 3: the most frequent nation_position in each year.
# `yearly_counts` counts rows per (year, nation_position), ignoring rows with
# a null nation_position; `max_counts` takes the largest count per year. The
# final join returns every position whose count equals that maximum, so tied
# positions all appear for the same year.
sqlWay3 = spark.sql("""
WITH yearly_counts AS (
    SELECT dv.year, dv.nation_position, COUNT(*) AS position_count
    FROM df_view dv
    WHERE dv.nation_position IS NOT NULL
    GROUP BY dv.year, dv.nation_position
),
max_counts AS (
    SELECT yc.year, MAX(yc.position_count) AS max_count
    FROM yearly_counts yc
    GROUP BY yc.year
)
SELECT mc.year, yc.nation_position, mc.max_count
FROM max_counts mc
JOIN yearly_counts yc ON mc.year = yc.year AND mc.max_count = yc.position_count
ORDER BY mc.year;


""")

In [None]:
sqlWay3.show()

## Tasks 3 and 4 are still in progress.

### Task III 


### EDA

#### Data Preprocessing

##### Drop useless columns

In [None]:
# Columns removed before modelling (identifiers, free text, URLs, dates).
# TODO how to deal with time data
useless_columns = [
    'sofifa_id',
    'player_url',
    'long_name',
    'dob',
    'club_loaned_from',
    'nation_position',
    'nation_jersey_number',
    'body_type',
    'real_face',
    'player_face_url',
    'club_logo_url',
    'nation_logo_url',
    'nation_flag_url',
    'goalkeeping_speed',
    'player_tags',
    'nation_team_id',
    'short_name',
    'league_name',
    'id',
    'club_joined',
    'club_contract_valid_until',
]

In [None]:
# if in linux use
# df_read = df

In [None]:

# Start the modelling dataframe from the table read back from PostgreSQL,
# without the columns listed in useless_columns.
new_df = df_read.drop(*useless_columns)

In [None]:
new_df.show(5)

##### Delete columns that include url

In [None]:
from pyspark.sql.functions import col

# Collect every remaining column whose name contains "url" and drop them all
# in one call.
url_columns = list(filter(lambda column_name: 'url' in column_name, new_df.columns))
new_df = new_df.drop(*url_columns)

##### Drop columns with more than 50% missing values

In [None]:
from pyspark.sql.functions import col, sum as _sum, when

# Find columns in which more than 50% of the rows are null.
# The row count and all per-column null counts are each computed once (two
# Spark jobs in total) instead of running a filter/count pair for every column.
cols_to_drop = []
total_rows = new_df.count()
if total_rows > 0:  # an empty frame has no missing-value ratio to evaluate
    null_counts = new_df.select(
        [_sum(when(col(c).isNull(), 1).otherwise(0)).alias(c) for c in new_df.columns]
    ).first().asDict()

    for i in new_df.columns:
        missing = null_counts[i] / total_rows * 100
        if missing > 50:
            print('{} - {}%'.format(i, round(missing)))
            cols_to_drop.append(i)

In [None]:
# Remove the columns collected in cols_to_drop (more than 50% null values).
new_df = new_df.drop(*cols_to_drop)

In [None]:

from pyspark.sql import DataFrame
from pyspark.sql.functions import col, sum as _sum, when

# One aggregate expression per column: a 1 for each null value, summed.
# The result is a single-row dataframe of null counts keyed by column name.
null_count_exprs = [
    _sum(when(col(column_name).isNull(), 1).otherwise(0)).alias(column_name)
    for column_name in new_df.columns
]
na_counts = new_df.select(*null_count_exprs)

In [None]:
na_counts.show()

##### Drop the modifier after `+` in the position rating columns


In [None]:
from pyspark.sql.functions import split
from pyspark.sql.types import IntegerType

# Position rating columns whose values are split on "+" below.
columns1 = ['ls','st','rs','lw','lf','cf','rf','rw','lam','cam','ram',
            'lm','lcm','cm','rcm','rm','lwb','ldm', 'cdm','rdm','rwb',
            'lb','lcb','cb','rcb','rb']

# Keep only the part before the first "+" and cast it to an integer.
# The loop variable is deliberately not called `col`: that name is the
# pyspark.sql.functions.col helper used by other cells, and rebinding it to a
# string breaks those cells when they are re-run.
# The pattern is a raw string because "\+" is not a valid Python escape.
for position_col in columns1:
    new_df = new_df.withColumn(
        position_col,
        split(new_df[position_col], r'\+').getItem(0).cast(IntegerType())
    )

In [None]:
new_df.show(5)

#### Handling Missing Values


In [None]:
# Replace nulls with 0. With a numeric fill value PySpark only fills numeric
# columns; string columns are filled separately with na_value further down.
new_df = new_df.fillna(0)

In [None]:
na_value = "NA"

In [None]:
string_cols = [c for c, t in new_df.dtypes if t == 'string']

In [None]:

# Fill nulls in all string columns with na_value in one call.
# This replaces a per-column loop whose variable was named `col`, which
# overwrote the pyspark.sql.functions.col helper with a plain string.
new_df = new_df.fillna(na_value, subset=string_cols)

In [None]:
new_df.show()

#### Feature Engineering

In [None]:
from pyspark.sql.functions import split, when, col, array_contains, explode
import itertools

# Split positions into array
split_positions = split(new_df['player_positions'], ', ')

# Distinct positions, de-duplicated by Spark and sorted on the driver.
# Sorting matters: the previous list(set(...)) had an arbitrary order, so the
# order of the generated Position_* columns could change between kernel
# restarts, while a later cell selects columns by position.
distinct_positions = sorted(
    row['position']
    for row in new_df.select(explode(split_positions).alias('position')).distinct().collect()
)

# Create a 0/1 indicator column for each distinct position
for position in distinct_positions:
    new_df = new_df.withColumn(
        'Position_' + position,
        when(array_contains(split_positions, position), 1).otherwise(0)
    )




In [None]:
# The raw positions string is no longer needed once the Position_* indicator
# columns have been added.
new_df = new_df.drop('player_positions')

In [None]:
# Check missing values again: sum a 1-per-null indicator for every column and
# show the resulting single-row table.
null_count_exprs = [
    _sum(when(col(column_name).isNull(), 1).otherwise(0)).alias(column_name)
    for column_name in new_df.columns
]
na_counts = new_df.select(*null_count_exprs)
na_counts.show()

In [None]:
# One-hot encode preferred_foot in two pipeline stages:
#   1. StringIndexer maps each category to a numeric index
#      (column 'indexed_preferred_foot').
#   2. OneHotEncoder turns that index into the vector column
#      'preferred_foot_encoded'.
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.ml import Pipeline
from pyspark.sql.functions import col
indexer = StringIndexer(inputCol='preferred_foot', outputCol='indexed_preferred_foot')
encoder = OneHotEncoder(inputCols=['indexed_preferred_foot'], outputCols=['preferred_foot_encoded'])

pipeline = Pipeline(stages=[indexer,encoder])

# Fit both stages on new_df; the fitted pipeline is applied to new_df below.
model = pipeline.fit(new_df)

# After transforming, drop the raw column and the intermediate index so only
# the encoded vector remains.
col_to_drop = ['indexed_preferred_foot','preferred_foot']
data_encoded = model.transform(new_df).drop(*col_to_drop)

In [None]:
# Label-encode every column that is still of type string: each one is replaced
# by a numeric '<name>_indexed' column produced by a StringIndexer.
from pyspark.ml.feature import StringIndexer

# The list of string columns is taken once, before the dataframe is modified.
string_column_names = [name for name, dtype in data_encoded.dtypes if dtype == 'string']

for col_name in string_column_names:
    indexed_name = col_name + '_indexed'
    # Fit an indexer for this column only
    indexer = StringIndexer(inputCol=col_name, outputCol=indexed_name)
    model = indexer.fit(data_encoded)

    # Add the indexed column, then drop the original string column
    data_encoded = model.transform(data_encoded).drop(col_name)

In [None]:
data_encoded.show(5)

In [None]:
data_encoded.printSchema()

## Using a Denoising Autoencoder for Imputation

In [None]:
def inspect_columns(df):
    """
    Build a per-column summary table for a pandas DataFrame.

    The result has one row per column of ``df``. Its columns report whether
    all values are distinct, the number of distinct values, null presence and
    percentage, the count of zeros, mean/median/std/min/max, the three
    quartiles, three sample rows (first, one drawn with ``np.random.randint``,
    last) and the dtype.
    """
    n_rows = len(df)
    distinct = df.nunique()

    summary = {}
    summary['unique'] = distinct == n_rows
    summary['cardinality'] = distinct
    summary['with_null'] = df.isna().any()
    summary['null_pct'] = round((df.isnull().sum() / n_rows) * 100, 2)
    summary['zero_count'] = (df == 0).sum()
    summary['mean'] = df.mean()
    summary['median'] = df.median()
    summary['std_dev'] = df.std()
    summary['min'] = df.min()
    summary['max'] = df.max()
    # Quartiles, in the same key order as the other statistics
    for label, q in (('1st_quantile', 0.25), ('2nd_quantile', 0.50), ('3rd_quantile', 0.75)):
        summary[label] = df.quantile(q)
    summary['1st_row'] = df.iloc[0]
    summary['random_row'] = df.iloc[np.random.randint(low=0, high=n_rows)]
    summary['last_row'] = df.iloc[-1]
    summary['dtype'] = df.dtypes

    return pd.DataFrame(summary)

In [None]:
data_encoded.show(5)

In [None]:
# Collect the whole encoded Spark dataframe to the driver as a pandas
# DataFrame; everything from here on works on pandas/NumPy data.
pd_df = data_encoded.toPandas()

In [None]:
# output data
# Written to the current working directory instead of a user-specific absolute
# path, so the cell also runs on other machines. The file name is unchanged.
pd_df.to_csv(os.path.join(os.getcwd(), 'pypark_data.csv'))

In [None]:
# Keep every column except the last six.
# NOTE(review): presumably the trailing six columns are the encoded ones (the
# one-hot vector and the *_indexed columns); this depends on column order, so
# confirm against data_encoded.printSchema().
data_noencoded = pd_df.iloc[:,:-6]
data_noencoded

In [None]:
target = 'overall'

In [None]:
pd_df

In [None]:
data_noencoded = ReduceMemory(data_noencoded)

In [None]:
inspect_columns(data_noencoded)

In [None]:
import math
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

import torch
import torch.nn.functional as F
import torch.utils.data
from torch import nn

from tqdm import tqdm

In [None]:
def seed_everything(seed: int):
    """
    Seed Python's ``random``, NumPy and PyTorch, and make cuDNN deterministic.

    Parameters
    ----------
    seed : int
        Seed applied to every random number generator.

    Notes
    -----
    ``PYTHONHASHSEED`` is exported for child processes; changing it here does
    not alter string hashing in the interpreter that is already running.
    """
    import random, os
    import numpy as np
    import torch

    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every CUDA device; this is a no-op when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick different algorithms from
    # run to run, which defeats the deterministic setting above.
    torch.backends.cudnn.benchmark = False

seed_everything(seed=42)

In [None]:
# Define masks


# Training mask: one guaranteed hidden cell per row plus independent dropout
def random_mask(shape, binomial_p=0.05):
    """
    Return a ``shape``-sized array of 1.0 (keep) and 0.0 (hide).

    Every row gets exactly one randomly chosen hidden cell; in addition each
    cell is hidden independently with probability ``binomial_p``.
    """
    rows, cols = shape

    # Exactly one forced zero per row, at a random column
    forced = np.ones((rows, cols))
    forced[np.arange(rows), np.random.randint(0, cols, rows)] = 0

    # Independent Bernoulli keep-mask (1 with probability 1 - binomial_p)
    keep = np.random.binomial(1, 1-binomial_p, (rows, cols))

    # A cell survives only if both masks keep it
    return forced * keep


# Validation mask: the same fixed number of hidden cells in every row
def mask_n_rows(shape, n_missing):
    """
    Return a ``shape``-sized array of ones with ``n_missing`` zeros per row.

    The hidden columns of each row are the first ``n_missing`` entries of a
    random permutation of that row's column indices.
    """
    n_rows, n_cols = shape

    # Arg-sorting uniform noise along each row yields a random permutation of
    # the column indices; the leading n_missing of them are hidden.
    hidden_cols = np.random.rand(n_rows, n_cols).argsort(1)[:, :n_missing]
    # Row index repeated once per hidden cell, aligned with hidden_cols.ravel()
    hidden_rows = np.arange(n_rows).repeat(n_missing)

    mask = np.ones((n_rows, n_cols))
    mask[hidden_rows, hidden_cols.ravel()] = 0
    return mask

In [None]:
# `target` is rebound here: it previously held the column name 'overall' and
# now holds that column's values.
target = data_noencoded['overall']
# Every column except the first one.
# NOTE(review): selection is positional; confirm which column is dropped and
# whether 'overall' is meant to remain among the features.
features = data_noencoded.iloc[:, 1:]

In [None]:
data = features.copy()

In [None]:
# Defining the train and validation set

# Rows that contain at least one missing value
is_missing_bool = data.isna().any(axis=1)

# Define subsets of the data with row-wise missing values
X_complete = data.loc[~is_missing_bool, :].values
X_missing = data.loc[is_missing_bool, :].values

# Split data that has no missing to use for eval set
X_train_complete, X_valid = train_test_split(X_complete, random_state=6) # Same as previous

# Build train set from complete and missing data
X_train = np.concatenate([X_train_complete, X_missing], axis=0)

# Marks which training cells are missing in the source data (1 = missing).
# Rows taken from the complete subset are all zeros; the row order matches
# X_train above.
srce_nan_train = np.concatenate([
    np.zeros(X_train_complete.shape),
    data.loc[is_missing_bool, :].isna().astype(np.uint8).values
])

# Feature scaling
# NOTE(review): the scaler is fitted on all rows, including those that end up
# in X_valid — confirm this is intended.
scaler = StandardScaler()

scaler.fit(data.values)

# Missing cells become 0 after scaling. `nan=` is passed by keyword because
# the second positional parameter of np.nan_to_num is `copy`, not the
# replacement value.
X_train = np.nan_to_num(scaler.transform(X_train), nan=0.0)
X_valid = scaler.transform(X_valid)

In [None]:
X_train.shape

In [None]:
# Build Model

class MLP(nn.Module):
    """Dense block: Linear -> Mish activation -> LayerNorm."""

    def __init__(self, input_size, output_size):
        super().__init__()
        self.dense = nn.Linear(input_size, output_size)
        self.act = nn.Mish()
        self.layernorm = nn.LayerNorm(output_size, eps=1e-6)

    def forward(self, x):
        return self.layernorm(self.act(self.dense(x)))


# Masked autoencoder model
class MaskedAutoencoder(nn.Module):
    """
    MLP autoencoder that predicts masked cells of a row.

    Each input value and its mask bit are projected to ``emb_dim`` features;
    the two projections are summed, flattened per row and layer-normalised.
    Every MLP block after the first receives the concatenation of the two
    most recent activations (skip connection), and so does the final linear
    layer that maps back to ``n_columns`` outputs.

    Parameters
    ----------
    n_columns : int
        Number of columns in each input row.
    emb_dim : int
        Embedding width per column.
    units : sequence of int
        Output width of each MLP block; must contain at least one entry.

    Raises
    ------
    ValueError
        If ``units`` is empty.
    """

    def __init__(self, n_columns, emb_dim,
                 units=(512, 512, 512, 512, 512, 128)):
        super().__init__()
        # The default is an immutable tuple; lists are still accepted.
        units = list(units)
        if not units:
            raise ValueError("units must contain at least one layer width")
        self.n_columns = n_columns

        # Embedding
        self.inp_proj = nn.Linear(1, emb_dim)
        self.mask_proj = nn.Linear(1, emb_dim)
        self.emb_norm = nn.LayerNorm(n_columns * emb_dim, eps=1e-6)

        # MLP with skip connection.
        # `widths` mirrors the list of activations built in forward(): the
        # embedding first, then the output of every block. Deriving each
        # block's input size from it keeps constructor and forward pass
        # consistent for any number of blocks, including a single one.
        widths = [n_columns * emb_dim]
        self.mlp_layers = nn.ModuleList([])
        for output_size in units:
            if len(widths) == 1:
                input_size = widths[-1]
            else:
                input_size = widths[-1] + widths[-2]
            self.mlp_layers.append(
                MLP(input_size=input_size, output_size=output_size)
            )
            widths.append(output_size)

        self.final_dense = nn.Linear(widths[-1] + widths[-2], self.n_columns)

    def forward(self, inputs:torch.Tensor, mask:torch.Tensor):
        """
        Return ``inputs`` with the cells where ``mask`` is 0 replaced by the
        network's prediction; cells where ``mask`` is 1 pass through unchanged.
        """
        # Embeddings: each scalar becomes an emb_dim vector; the mask
        # projection is fed 1 - mask, i.e. 1 for hidden cells
        input_embedding = self.inp_proj(torch.unsqueeze(inputs, 2))
        mask_embedding = self.mask_proj(torch.unsqueeze(1-mask, 2))
        embedding = input_embedding + mask_embedding
        embedding = torch.flatten(embedding, 1)
        x = [self.emb_norm(embedding)]

        # MLP: the first block sees the embedding alone, later blocks see the
        # two most recent activations concatenated
        for i, layer in enumerate(self.mlp_layers):
            z = x[0] if i == 0 else torch.cat((x[-1], x[-2]), 1)
            x.append(layer(z))

        x = torch.cat((x[-1], x[-2]), 1)
        x = self.final_dense(x)

        # Output modification - predict only masked values, otherwise use inputs
        outputs = torch.mul(inputs, mask) + torch.mul(1-mask, x)

        return outputs

In [None]:
# Helper validation method
def validate(model, valid_mask, batch_size=4096):
    """
    Compute the RMSE of the model's predictions on the masked cells of X_valid.

    ``valid_mask`` has the same shape as the module-level ``X_valid`` and
    marks hidden cells with 0. The masked validation data is fed through the
    model in batches; predictions and ground truth are mapped back to the
    original scale with the module-level ``scaler`` and compared on the hidden
    cells only.

    The model is switched to eval mode for the duration of the call and then
    restored to whatever mode it was in before, so calling this inside a
    training loop does not leave the model in eval mode.

    Parameters
    ----------
    model : nn.Module
        Called as ``model(x_masked, mask)``.
    valid_mask : np.ndarray
        Array of 1 (visible) / 0 (hidden), same shape as ``X_valid``.
    batch_size : int
        Number of rows per forward pass.

    Returns
    -------
    float
        RMSE over the hidden cells, in original (unscaled) units.
    """
    assert valid_mask.shape == X_valid.shape

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            ps = []
            # Stepping by batch_size never produces an empty trailing batch,
            # unlike `n // batch_size + 1` when n is an exact multiple.
            for start in range(0, X_valid.shape[0], batch_size):
                stop = start + batch_size
                x = torch.tensor(X_valid[start:stop].astype(np.float32)).to(device)
                mask = torch.tensor(valid_mask[start:stop].astype(np.float32)).to(device)
                x_masked = x * mask

                p = model(x_masked, mask).cpu().numpy()
                ps.append(p)
    finally:
        model.train(was_training)

    p = np.vstack(ps)
    mask_bool = (1 - valid_mask).astype(bool)
    rmse = np.sqrt(mean_squared_error(
        scaler.inverse_transform(p)[mask_bool],
        scaler.inverse_transform(X_valid)[mask_bool]
    ))
    return rmse

In [None]:
# Loss function to mask NaNs in the original data
class MaskedMSELoss(nn.Module):
    """
    Mean squared error that gives zero weight to selected cells.

    ``mask`` is 1 for cells to exclude and 0 for cells to include. The squared
    errors of excluded cells are zeroed, and the mean is then taken over all
    cells (excluded ones included in the denominator).
    """

    def __init__(self):
        super().__init__()
        # Element-wise squared error; the reduction is done in forward()
        self.loss = nn.MSELoss(reduction='none')

    def forward(self, inputs, target, mask):
        keep = 1 - mask
        per_element = self.loss(inputs, target)
        return (per_element * keep).mean()

In [None]:
# Defining model parameters and learning rate schedule

EPOCHS, BATCH_SIZE = 300, 4096
LR_START, LR_END = 0.001, 0.00005

# This cosine decay function is borrowed from AmbrosM in last month's TPS
def cosine_decay(epoch):
    """
    Learning rate for ``epoch`` on a half-cosine from LR_START to LR_END.

    Epoch 0 gives LR_START and epoch EPOCHS - 1 gives LR_END. When EPOCHS is
    1 or less the rate is LR_START for every epoch.
    """
    # w runs from 1 (first epoch) down to 0 (last epoch)
    w = (1 + math.cos(epoch / (EPOCHS - 1) * math.pi)) / 2 if EPOCHS > 1 else 1
    return w * LR_START + (1 - w) * LR_END

In [None]:
# Build model

def init_weights(m):
    """Xavier-normal initialisation for Linear weights; biases are set to 0.01."""
    if isinstance(m, nn.Linear):
        torch.nn.init.xavier_normal_(m.weight)
        # Linear layers created with bias=False have no bias tensor to fill
        if m.bias is not None:
            m.bias.data.fill_(0.01)

# Build model
# Use Apple's MPS backend when it exists and is available, otherwise CUDA,
# otherwise the CPU, so the notebook also runs on machines without MPS.
_mps_backend = getattr(torch.backends, "mps", None)
if _mps_backend is not None and _mps_backend.is_available():
    device = 'mps'
elif torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# params
cols_n = X_train.shape[1]

# Final model uses units = [2048, 2048, 2048, 1024, 512, 256, 128], but I use a smaller model for this notebook
model = MaskedAutoencoder(cols_n, 15,units=[512, 512, 512, 512, 512, 256, 128]).to(device)
model.apply(init_weights)
# The base lr is 1 on purpose: LambdaLR multiplies the base lr by the lambda's
# return value, and cosine_decay returns the absolute learning rate.
optimizer = torch.optim.Adam(model.parameters(), lr=1)
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=cosine_decay)
loss_fn = MaskedMSELoss()

In [None]:
# Training loop

# for epoch in epochs...

np.random.seed(6)

n = X_train.shape[0]

batch_size = BATCH_SIZE
# Ceiling division: no empty trailing batch when n is a multiple of batch_size
n_batches = math.ceil(n / batch_size)
index = np.arange(n)

valid_per = 5  # run validation every `valid_per` epochs

# Validation masks: mask k hides exactly k cells per row, for k = 1..5
validation_masks = [mask_n_rows(X_valid.shape, i+1) for i in range(5)]

# Weight of each mask in the combined score: the share of rows with exactly k
# missing values among all rows that have at least one missing value.
# Counts are looked up by k rather than by position, so the weights do not
# depend on value_counts() ordering, and a k that never occurs gets weight 0
# instead of raising an IndexError.
missing_per_row = data.isna().sum(axis=1)
missing_counts = missing_per_row[missing_per_row > 0].value_counts()
n_rows_with_missing = missing_counts.sum()
if n_rows_with_missing > 0:
    validation_prob = [
        missing_counts.get(k, 0) / n_rows_with_missing
        for k in range(1, len(validation_masks) + 1)
    ]
else:
    # No row has missing values: weight all masks equally
    validation_prob = [1 / len(validation_masks)] * len(validation_masks)

c_scores = [np.zeros(EPOCHS) for i in range(len(validation_masks))]
f_scores = np.zeros(EPOCHS)

# Training loop
for epoch in range(EPOCHS):
    # Re-enter training mode every epoch so it does not depend on what the
    # validation step did to the model's mode.
    model.train()
    print(f"Epoch {epoch+1} LR {optimizer.param_groups[0]['lr']}")

    np.random.shuffle(index)
    losses = 0.0
    for i in tqdm(range(n_batches)):
        batch_idx = index[i*batch_size:(i+1)*batch_size]
        # Create batch train data
        srce_mask = torch.tensor(srce_nan_train[batch_idx].astype(np.float32)).to(device)
        x = torch.tensor(X_train[batch_idx].astype(np.float32)).to(device)
        mask_init = torch.tensor(random_mask(x.shape, binomial_p=0.05).astype(np.float32)).to(device)
        # A cell is visible only if the random mask keeps it AND it is not
        # missing in the source data
        mask = mask_init - srce_mask * mask_init
        x_masked = x * mask

        # Forward and backward pass
        optimizer.zero_grad()
        p = model(x_masked, mask)
        # Cells that are missing in the source have no ground truth and are
        # given zero weight in the loss
        loss = loss_fn(p, x, srce_mask)
        loss.backward()
        optimizer.step()

        # .item() stores a plain float instead of accumulating tensors
        losses += loss.item()
    scheduler.step()
    print(f"Train loss {losses / max(n_batches, 1)}")

    # Validation step
    if (epoch + 1) % valid_per == 0:
        scores = []
        for i in range(len(validation_masks)):
            v = validate(model, validation_masks[i])
            scores.append(v)
            c_scores[i][epoch] = v

        # Combined score: square root of the probability-weighted mean of the
        # squared per-mask RMSEs
        final_score = math.sqrt(sum([scores[i]**2 * validation_prob[i] for i in range(len(scores))]))
        f_scores[epoch] = final_score

        for i in range(len(scores)):
            print(f'RMSE ({i+1} rows) {scores[i]}')
        print(f'RMSE (TDGP) {final_score}')