In [220]:
import pandas as pd
import plotly.express as px
from sklearn.preprocessing import TargetEncoder
from sklearn.feature_selection import f_classif, mutual_info_classif, RFE, SequentialFeatureSelector
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.metrics import classification_report
from sklearn.decomposition import PCA, FactorAnalysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.manifold import TSNE
import umap
import numpy as np


IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html



In [2]:
df = pd.read_csv('data_science_job.csv')
df.drop('enrollee_id', axis=1, inplace=True)

In [3]:
df.isna().sum()

city                         0
city_development_index     479
gender                    4508
relevent_experience          0
enrolled_university        386
education_level            460
major_discipline          2813
experience                  65
company_size              5938
company_type              6140
training_hours             766
target                       0
dtype: int64

In [4]:
# Fill nan values
df['city_development_index'] = df['city_development_index'].fillna(df['city_development_index'].median())
df['gender'] = df['gender'].fillna(df['gender'].mode()[0])
df['enrolled_university'] = df['enrolled_university'].fillna(df['enrolled_university'].mode()[0])
df['education_level'] = df['education_level'].fillna(df['education_level'].mode()[0])
df['major_discipline'] = df['major_discipline'].fillna(df['major_discipline'].mode()[0])
df['experience'] = df['experience'].fillna(df['experience'].median())
df['company_size'] = df['company_size'].fillna(df['company_size'].mode()[0])
df['company_type'] = df['company_type'].fillna(df['company_type'].mode()[0])
df['training_hours'] = df['training_hours'].fillna(df['training_hours'].median())

In [5]:
df.isna().sum()

city                      0
city_development_index    0
gender                    0
relevent_experience       0
enrolled_university       0
education_level           0
major_discipline          0
experience                0
company_size              0
company_type              0
training_hours            0
target                    0
dtype: int64

In [6]:
df['target'] = df['target'].astype('boolean')

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19158 entries, 0 to 19157
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   city                    19158 non-null  object 
 1   city_development_index  19158 non-null  float64
 2   gender                  19158 non-null  object 
 3   relevent_experience     19158 non-null  object 
 4   enrolled_university     19158 non-null  object 
 5   education_level         19158 non-null  object 
 6   major_discipline        19158 non-null  object 
 7   experience              19158 non-null  float64
 8   company_size            19158 non-null  object 
 9   company_type            19158 non-null  object 
 10  training_hours          19158 non-null  float64
 11  target                  19158 non-null  boolean
dtypes: boolean(1), float64(3), object(8)
memory usage: 1.6+ MB


In [8]:
tg = TargetEncoder(smooth='auto', target_type='binary')
y = df['target']
x = df.drop(list(df.select_dtypes(exclude='object').columns), axis=1)

x_trans = tg.fit_transform(x, y)
df_encoded = pd.DataFrame(x_trans, columns=x.columns)

In [9]:
df_encoded['target'] = y
df_encoded

Unnamed: 0,city,gender,relevent_experience,enrolled_university,education_level,major_discipline,company_size,company_type,target
0,0.210841,0.248272,0.213115,0.215456,0.278311,0.250955,0.330624,0.261022,True
1,0.149396,0.248272,0.341732,0.215456,0.278311,0.250955,0.330624,0.261022,False
2,0.585412,0.248540,0.334007,0.378715,0.275560,0.251229,0.326341,0.260967,False
3,0.358828,0.248540,0.334007,0.213721,0.275560,0.229394,0.326341,0.260967,True
4,0.311754,0.248168,0.213811,0.214452,0.213910,0.250018,0.326683,0.141869,False
...,...,...,...,...,...,...,...,...,...
19153,0.092249,0.248044,0.334392,0.215091,0.280080,0.209107,0.325053,0.259948,True
19154,0.210841,0.248272,0.213115,0.215456,0.278311,0.250955,0.330624,0.261022,True
19155,0.214336,0.248044,0.216072,0.215091,0.280080,0.251319,0.325053,0.259948,False
19156,0.138417,0.248272,0.213115,0.215456,0.197931,0.250955,0.170747,0.261022,False


In [10]:
fig = px.imshow(
    df_encoded.corr(), text_auto=True, aspect='auto'
)
fig.show()

In [11]:
final_df = df
final_df[df_encoded.columns] = df_encoded
final_df

Unnamed: 0,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,training_hours,target
0,0.210841,0.920,0.248272,0.213115,0.215456,0.278311,0.250955,20.0,0.330624,0.261022,36.0,True
1,0.149396,0.776,0.248272,0.341732,0.215456,0.278311,0.250955,15.0,0.330624,0.261022,47.0,False
2,0.585412,0.624,0.248540,0.334007,0.378715,0.275560,0.251229,5.0,0.326341,0.260967,83.0,False
3,0.358828,0.789,0.248540,0.334007,0.213721,0.275560,0.229394,0.0,0.326341,0.260967,52.0,True
4,0.311754,0.767,0.248168,0.213811,0.214452,0.213910,0.250018,20.0,0.326683,0.141869,8.0,False
...,...,...,...,...,...,...,...,...,...,...,...,...
19153,0.092249,0.878,0.248044,0.334392,0.215091,0.280080,0.209107,14.0,0.325053,0.259948,42.0,True
19154,0.210841,0.920,0.248272,0.213115,0.215456,0.278311,0.250955,14.0,0.330624,0.261022,52.0,True
19155,0.214336,0.920,0.248044,0.216072,0.215091,0.280080,0.251319,20.0,0.325053,0.259948,44.0,False
19156,0.138417,0.802,0.248272,0.213115,0.215456,0.197931,0.250955,0.0,0.170747,0.261022,97.0,False


In [12]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19158 entries, 0 to 19157
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   city                    19158 non-null  float64
 1   city_development_index  19158 non-null  float64
 2   gender                  19158 non-null  float64
 3   relevent_experience     19158 non-null  float64
 4   enrolled_university     19158 non-null  float64
 5   education_level         19158 non-null  float64
 6   major_discipline        19158 non-null  float64
 7   experience              19158 non-null  float64
 8   company_size            19158 non-null  float64
 9   company_type            19158 non-null  float64
 10  training_hours          19158 non-null  float64
 11  target                  19158 non-null  boolean
dtypes: boolean(1), float64(11)
memory usage: 1.6 MB


In [13]:
# Part 1 Task 1
fig = px.imshow(
    final_df.corr(), text_auto=True, aspect='auto'
)
fig.show()

In [73]:
# Part 1 Task 2
def rforest_feature_selection(df, target_col, threshold = 0.1):
    model = xgb.XGBRFClassifier()
    x = df.drop(target_col, axis=1)
    y = df[target_col]
    model.fit(x, y)
    features_importances = dict(zip(x.columns.to_list(), list(model.feature_importances_)))
    return dict(filter(lambda pair: pair[1] >= threshold, features_importances.items()))

In [75]:
# Part 1 Task 3
def p_value_feature_selection(df, target_col):
    x = df.drop(target_col, axis=1)
    y = df[target_col]
    f_test = f_classif(x, y)
    p_values = f_test[1]
    features_p_values = dict(zip(x.columns.to_list(), list(p_values)))
    return dict(filter(lambda pair: pair[1] < 0.05, features_p_values.items()))

In [78]:
# Part 1 Task 4
def mutual_info_feature_selection(df, target_col, percentile=0.5):
    x = df.drop(target_col, axis=1)
    y = df[target_col]
    features_importances = mutual_info_classif(x, y)    
    features_importances = dict(zip(x.columns.to_list(), list(features_importances)))
    features_importances = dict(sorted(features_importances.items(),
                                       key=lambda pair: pair[0])[:round(len(features_importances)*percentile)])
    return features_importances

In [80]:
# Part 1 Task 5
def rfe_feature_selection(df, target_col, percentile=0.5):
    x = df.drop(target_col, axis=1)
    y = df[target_col]    
    model = RFE(estimator=SVC(kernel='linear'), n_features_to_select=percentile)
    model.fit(x, y)
    return {feature: True for feature in model.get_feature_names_out()}

In [17]:
# Some selected features from data science df 'cause rfe method is kinda slow
rfe_selected = {'city': True,
                'city_development_index': True,
                'enrolled_university': True,
                'company_size': True,
                'company_type': True}

In [74]:
# Part 1 Task 6
# Random Forest
model = xgb.XGBClassifier()
x = final_df[list(rforest_feature_selection(
    final_df, 'target', 0.02
).keys())]
y = final_df['target']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y)
model.fit(x_train, y_train)
pred = model.predict(x_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

         0.0       0.83      0.88      0.85      2877
         1.0       0.56      0.45      0.50       955

    accuracy                           0.77      3832
   macro avg       0.69      0.66      0.67      3832
weighted avg       0.76      0.77      0.76      3832



In [76]:
# Part 1 Task 6
# P-Value
model = xgb.XGBClassifier()
x = final_df[list(p_value_feature_selection(
    final_df, 'target'
).keys())]
y = final_df['target']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y)
model.fit(x_train, y_train)
pred = model.predict(x_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

         0.0       0.82      0.88      0.85      2877
         1.0       0.54      0.43      0.48       955

    accuracy                           0.77      3832
   macro avg       0.68      0.66      0.66      3832
weighted avg       0.75      0.77      0.76      3832



In [79]:
# Part 1 Task 6
# Mutual Info
model = xgb.XGBClassifier()
x = final_df[list(mutual_info_feature_selection(
    final_df, 'target'
).keys())]
y = final_df['target']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y)
model.fit(x_train, y_train)
pred = model.predict(x_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

         0.0       0.82      0.89      0.85      2877
         1.0       0.55      0.41      0.47       955

    accuracy                           0.77      3832
   macro avg       0.69      0.65      0.66      3832
weighted avg       0.75      0.77      0.76      3832



In [23]:
# Part 1 Task 6
# RFE
model = xgb.XGBClassifier()
x = final_df[list(rfe_selected.keys())]
y = final_df['target']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y)
model.fit(x_train, y_train)
pred = model.predict(x_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

         0.0       0.82      0.89      0.85      2877
         1.0       0.55      0.41      0.47       955

    accuracy                           0.77      3832
   macro avg       0.69      0.65      0.66      3832
weighted avg       0.75      0.77      0.76      3832



In [None]:
# RForest is the best for now

In [16]:
body_df = pd.read_csv('bodies_dataset.csv')

In [17]:
# Part 2 Task 1
fig = px.imshow(
    body_df.corr(), text_auto=True, aspect='auto'
)
fig.show()

In [20]:
body_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 507 entries, 0 to 506
Data columns (total 25 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   bia_di  507 non-null    float64
 1   bii_di  507 non-null    float64
 2   bit_di  507 non-null    float64
 3   che_de  507 non-null    float64
 4   che_di  507 non-null    float64
 5   elb_di  507 non-null    float64
 6   wri_di  507 non-null    float64
 7   kne_di  507 non-null    float64
 8   ank_di  507 non-null    float64
 9   sho_gi  507 non-null    float64
 10  che_gi  507 non-null    float64
 11  wai_gi  507 non-null    float64
 12  nav_gi  507 non-null    float64
 13  hip_gi  507 non-null    float64
 14  thi_gi  507 non-null    float64
 15  bic_gi  507 non-null    float64
 16  for_gi  507 non-null    float64
 17  kne_gi  507 non-null    float64
 18  cal_gi  507 non-null    float64
 19  ank_gi  507 non-null    float64
 20  wri_gi  507 non-null    float64
 21  age     507 non-null    int64  
 22  wg

In [19]:
body_df['sex'] = body_df['sex'].astype('boolean')

In [24]:
# Basic prediction. 1 - Male, 0 - Female
model = xgb.XGBClassifier()
x = body_df.drop('sex', axis=1)
y = body_df['sex']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y)
model.fit(x_train, y_train)
pred = model.predict(x_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

         0.0       0.94      0.98      0.96        52
         1.0       0.98      0.94      0.96        50

    accuracy                           0.96       102
   macro avg       0.96      0.96      0.96       102
weighted avg       0.96      0.96      0.96       102



In [28]:
# Part 2 Task 2
def columns_to_exclude(feature_df, corr_algo='pearson', exclude_treshold=0.5):
    corr_matrix = feature_df.corr(method=corr_algo).abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))   
    to_drop = [column for column in upper.columns if any(upper[column] > exclude_treshold)]
    return to_drop

In [44]:
# Part 2 Task 2 prediction
model = xgb.XGBClassifier()
excluded_cols = columns_to_exclude(body_df.drop('sex', axis=1), exclude_treshold=0.9)
x = body_df.drop(excluded_cols + ['sex'], axis=1)
y = body_df['sex']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y)
model.fit(x_train, y_train)
pred = model.predict(x_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

         0.0       1.00      0.96      0.98        52
         1.0       0.96      1.00      0.98        50

    accuracy                           0.98       102
   macro avg       0.98      0.98      0.98       102
weighted avg       0.98      0.98      0.98       102



In [64]:
# Part 2 Task 3
def select_most_corr_with_target(df, target_name, corr_algo='pearson', percentile=0.5):
    target_col: pd.Series = df.corr(method=corr_algo).abs()[target_name]
    best_cols = (target_col.sort_values(ascending=False).iloc[1:]
                 .head(round(len(target_col)*percentile)))
    return best_cols.index.to_list()
select_most_corr_with_target(body_df, 'sex')
    

['for_gi',
 'sho_gi',
 'bia_di',
 'elb_di',
 'wri_gi',
 'che_gi',
 'bic_gi',
 'wri_di',
 'che_di',
 'ank_di',
 'hgt',
 'wai_gi']

In [87]:
# Part 2 task 3 Prediction
model = xgb.XGBClassifier()
selected_cols = select_most_corr_with_target(body_df, 'sex', percentile=0.8)
x = body_df.drop('sex', axis=1)[selected_cols]
y = body_df['sex']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y)
model.fit(x_train, y_train)
pred = model.predict(x_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

         0.0       0.96      0.98      0.97        52
         1.0       0.98      0.96      0.97        50

    accuracy                           0.97       102
   macro avg       0.97      0.97      0.97       102
weighted avg       0.97      0.97      0.97       102



In [71]:
# Part 2 Task 4 Prediction
model = xgb.XGBClassifier()
excluded_cols = columns_to_exclude(body_df.drop('sex', axis=1), exclude_treshold=0.9, corr_algo='spearman')
x = body_df.drop(excluded_cols + ['sex'], axis=1)
y = body_df['sex']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y)
model.fit(x_train, y_train)
pred = model.predict(x_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

         0.0       0.96      1.00      0.98        52
         1.0       1.00      0.96      0.98        50

    accuracy                           0.98       102
   macro avg       0.98      0.98      0.98       102
weighted avg       0.98      0.98      0.98       102



In [154]:
# Some test prediction for body dataset with mutual info feature selection
model = xgb.XGBClassifier()
x = body_df[list(mutual_info_feature_selection(
    body_df, 'sex'
).keys())]
y = body_df['sex']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y)
model.fit(x_train, y_train)
pred = model.predict(x_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

         0.0       0.98      0.96      0.97        52
         1.0       0.96      0.98      0.97        50

    accuracy                           0.97       102
   macro avg       0.97      0.97      0.97       102
weighted avg       0.97      0.97      0.97       102



In [156]:
# Part 2 Task 6 Log normalization
x = body_df.drop('sex', axis=1)
for col in x.columns:
    x[col] = x[col].apply(lambda val: np.log(val) if val != 0 else 0)
y = body_df['sex']
fig = px.imshow(
    pd.concat([x, y], axis=1).corr(), text_auto=True, aspect='auto'
)
fig.show()

In [157]:
# Some prediction for Part 2 task 6
x = body_df.drop('sex', axis=1)
for col in x.columns:
    x[col] = x[col].apply(lambda val: np.log(val) if val != 0 else 0)
y = body_df['sex']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y)
model.fit(x_train, y_train)
pred = model.predict(x_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

         0.0       1.00      0.96      0.98        52
         1.0       0.96      1.00      0.98        50

    accuracy                           0.98       102
   macro avg       0.98      0.98      0.98       102
weighted avg       0.98      0.98      0.98       102



In [164]:
# Part 3 Task 1
def rfe_feature_selection_general(df, target_col, estimator, percentile=0.5):
    x = df.drop(target_col, axis=1)
    y = df[target_col]    
    model = RFE(estimator=estimator, n_features_to_select=percentile)
    model.fit(x, y)
    return {feature: True for feature in model.get_feature_names_out()}

In [168]:
# Part 3 Task 1
model = xgb.XGBClassifier()
selected_features = list(rfe_feature_selection_general(body_df, 'sex', LogisticRegression()).keys())
x = body_df[selected_features]
y = body_df['sex']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y)
model.fit(x_train, y_train)
pred = model.predict(x_test)
print(classification_report(y_test, pred))


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to th

              precision    recall  f1-score   support

         0.0       0.96      1.00      0.98        52
         1.0       1.00      0.96      0.98        50

    accuracy                           0.98       102
   macro avg       0.98      0.98      0.98       102
weighted avg       0.98      0.98      0.98       102




lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



In [170]:
# Part 3 Task 2
def seq_feature_selection_general(df, target_col, estimator, percentile=0.5):
    x = df.drop(target_col, axis=1)
    y = df[target_col]    
    model = SequentialFeatureSelector(estimator=estimator, n_features_to_select=percentile)
    model.fit(x, y)
    return {feature: True for feature in model.get_feature_names_out()}

In [171]:
# Part 3 Task 2
model = xgb.XGBClassifier()
selected_features = list(seq_feature_selection_general(body_df, 'sex', SVC(kernel='linear')).keys())
x = body_df[selected_features]
y = body_df['sex']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y)
model.fit(x_train, y_train)
pred = model.predict(x_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

         0.0       0.98      0.98      0.98        52
         1.0       0.98      0.98      0.98        50

    accuracy                           0.98       102
   macro avg       0.98      0.98      0.98       102
weighted avg       0.98      0.98      0.98       102



In [173]:
# Part 3 Task 3
selected_features = list(seq_feature_selection_general(body_df, 'sex', SVC(kernel='linear')).keys())
x = body_df[selected_features]
y = body_df['sex']
params = {'objective':'binary:logistic',
          'colsample_bytree': 0.3,
          'learning_rate': 0.1,
          'max_depth': 5,
          'alpha': 10}
xgb_matrix = xgb.DMatrix(data=x, label=y)
xgb_cv = xgb.cv(dtrain=xgb_matrix, params=params, nfold=3,
                num_boost_round=50, early_stopping_rounds=10, metrics="auc", as_pandas=True, seed=123)
xgb_cv.tail(10)
# Pretty good hyperparameters and feature selection because auc is good
# But that is not unusual as this dataset is very simple with clean data

Unnamed: 0,train-auc-mean,train-auc-std,test-auc-mean,test-auc-std
40,0.994054,0.000623,0.983727,0.004057
41,0.994194,0.000576,0.98368,0.004008
42,0.994159,0.0006,0.983773,0.003882
43,0.994159,0.0006,0.983773,0.003882
44,0.994194,0.000623,0.983866,0.003991
45,0.994369,0.000673,0.984287,0.0038
46,0.994358,0.000666,0.984287,0.0038
47,0.994439,0.000723,0.984427,0.00377
48,0.994498,0.000744,0.984568,0.004073
49,0.994498,0.000722,0.984615,0.004009


In [None]:
# There are pretty many examples for Part 3 Task 4, 5

In [182]:
# Part 4 Task 1
pca = PCA(n_components=5)
x = pca.fit_transform(body_df.drop('sex', axis=1))
pca_df = pd.DataFrame(data=x, columns=[f'PC_{i}' for i in range(x.shape[1])])
pca_df = pd.concat([pca_df, body_df['sex']], axis=1)
pca_df.head(10)

Unnamed: 0,PC_0,PC_1,PC_2,PC_3,PC_4,sex
0,-10.490298,-13.659059,-2.865011,-0.705562,0.745252,True
1,5.823066,-9.048384,-0.550563,0.561438,-4.259364,True
2,18.646961,-14.379482,-7.898175,-11.55864,-1.388464,True
3,3.264429,-14.930076,-5.264511,-9.780748,-4.167361,True
4,11.998789,-14.640156,-0.422695,-10.526607,-1.78166,True
5,13.116381,-17.191444,-3.242018,1.854384,-0.421257,True
6,29.911921,-15.348237,-1.87551,0.271622,7.993347,True
7,16.656044,-14.988888,-6.693347,-2.883872,4.958083,True
8,-13.338402,-17.081836,-10.139796,1.818015,3.529042,True
9,15.322727,-17.308233,1.481761,-5.697221,6.250296,True


In [183]:
fig = px.imshow(
    pca_df.corr(), text_auto=True, aspect='auto'
)
fig.show()

In [187]:
# Part 4 Task 1 prediction
model = xgb.XGBClassifier()
x = pca_df.drop('sex', axis=1)
y = pca_df['sex']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y)
model.fit(x_train, y_train)
pred = model.predict(x_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

         0.0       0.94      0.98      0.96        52
         1.0       0.98      0.94      0.96        50

    accuracy                           0.96       102
   macro avg       0.96      0.96      0.96       102
weighted avg       0.96      0.96      0.96       102



In [185]:
# Part 4 Task 2
fct_analysis = FactorAnalysis(n_components=5)
x = fct_analysis.fit_transform(body_df.drop('sex', axis=1))
fct_df = pd.DataFrame(data=x, columns=[f'FCT_{i}' for i in range(x.shape[1])])
fct_df = pd.concat([fct_df, body_df['sex']], axis=1)
fct_df.head(10)

Unnamed: 0,FCT_0,FCT_1,FCT_2,FCT_3,FCT_4,sex
0,-0.221462,-1.090733,-0.819623,-0.063725,0.307778,True
1,0.414076,-1.197151,-0.670763,-0.581306,-0.81141,True
2,0.707939,-0.613972,-0.590315,-0.40367,1.463259,True
3,0.206736,-0.868786,-1.027632,-1.693469,0.549782,True
4,0.701544,-0.654559,-1.60408,-1.255173,-0.124161,True
5,0.542622,-0.880815,-0.344529,0.355252,1.354393,True
6,1.626882,-1.013501,-1.663049,1.558841,-0.766149,True
7,0.810047,-0.967865,-1.176365,-0.517873,0.329133,True
8,-0.31762,-1.800342,-1.065672,0.077525,0.192501,True
9,0.897955,-0.394139,-1.942368,0.249758,0.172522,True


In [186]:
fig = px.imshow(
    fct_df.corr(), text_auto=True, aspect='auto'
)
fig.show()

In [188]:
# Part 4 Task 2 prediction
model = xgb.XGBClassifier()
x = fct_df.drop('sex', axis=1)
y = fct_df['sex']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y)
model.fit(x_train, y_train)
pred = model.predict(x_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

         0.0       0.98      0.98      0.98        52
         1.0       0.98      0.98      0.98        50

    accuracy                           0.98       102
   macro avg       0.98      0.98      0.98       102
weighted avg       0.98      0.98      0.98       102



In [191]:
# Part 4 Task 3
x = body_df.drop('sex', axis=1)
for col in x.columns:
    x[col] = x[col].apply(lambda val: np.log(val) if val != 0 else 0)
x = PCA(n_components=5).fit_transform(x)
pca_norm_df = pd.DataFrame(data=x, columns=[f'PCA_{i}' for i in range(x.shape[1])])
y = body_df['sex']

In [194]:
# Part 4 Task 3 prediction
model = xgb.XGBClassifier()
x = pca_norm_df
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y)
model.fit(x_train, y_train)
pred = model.predict(x_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00        52
         1.0       1.00      1.00      1.00        50

    accuracy                           1.00       102
   macro avg       1.00      1.00      1.00       102
weighted avg       1.00      1.00      1.00       102



In [201]:
# Part 4 Task 4 Linear Discriminant Analysis
x = final_df.drop('target', axis=1)
y = final_df['target']
lda = LinearDiscriminantAnalysis(n_components=1)
lda.fit(x, y)
lda_data = lda.transform(x)
lda_df = pd.DataFrame(data=lda_data,
                      columns=[f'LDA_{i}' for i in range(lda_data.shape[1])])
lda_df

Unnamed: 0,LDA_0
0,0.069537
1,0.256200
2,2.774584
3,1.362764
4,0.035786
...,...
19153,0.063493
19154,0.076029
19155,0.063075
19156,-1.225021


In [202]:
# Part 4 Task 4 Prediction
model = xgb.XGBClassifier()
x = lda_df
y = final_df['target']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y)
model.fit(x_train, y_train)
pred = model.predict(x_test)
print(classification_report(y_test, pred))



              precision    recall  f1-score   support

         0.0       0.81      0.91      0.86      2877
         1.0       0.56      0.36      0.44       955

    accuracy                           0.77      3832
   macro avg       0.69      0.64      0.65      3832
weighted avg       0.75      0.77      0.75      3832



In [203]:
# Part 4 Task 4.2
x = body_df.drop('sex', axis=1)
y = body_df['sex']
lda = LinearDiscriminantAnalysis(n_components=1)
lda.fit(x, y)
lda_data = lda.transform(x)
lda_df = pd.DataFrame(data=lda_data,
                      columns=[f'LDA_{i}' for i in range(lda_data.shape[1])])
lda_df

Unnamed: 0,LDA_0
0,2.066052
1,2.896961
2,3.290126
3,2.845850
4,2.841371
...,...
502,-2.523492
503,-2.354823
504,-2.238751
505,-2.168563


In [204]:
model = xgb.XGBClassifier()
x = lda_df
y = body_df['sex']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y)
model.fit(x_train, y_train)
pred = model.predict(x_test)
print(classification_report(y_test, pred))


              precision    recall  f1-score   support

         0.0       1.00      0.96      0.98        52
         1.0       0.96      1.00      0.98        50

    accuracy                           0.98       102
   macro avg       0.98      0.98      0.98       102
weighted avg       0.98      0.98      0.98       102



In [208]:
# Part 4 Task 4 QDA
x = final_df.drop('target', axis=1)
y = final_df['target']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y)
qda = QuadraticDiscriminantAnalysis()
qda.fit(x_train, y_train)
pred = qda.predict(x_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

         0.0       0.84      0.86      0.85      2877
         1.0       0.55      0.51      0.53       955

    accuracy                           0.77      3832
   macro avg       0.70      0.69      0.69      3832
weighted avg       0.77      0.77      0.77      3832



In [None]:
# Part 4 Task 5 is already presented higher because I'ValueError
# already made tons on predictions with LDA and PCA

In [219]:
# Part 5 Task 1-2
n_components = 2
learning_rate = 100
iters = 1000
x = body_df.drop('sex', axis=1)
y = body_df['sex']
tsne = TSNE(n_components=n_components, learning_rate=learning_rate,
            n_iter=250)
x_tsne = tsne.fit_transform(x)
tsne_df = pd.DataFrame(data=x_tsne,
                       columns=[f'TSNE_{i}' for i in range(n_components)])
tsne_df['sex'] = y


In [211]:
fig = px.scatter(tsne_df, x='TSNE_0', y='TSNE_1', color='sex')
fig.show()

In [216]:
pca = PCA(n_components=2)
x = pca.fit_transform(body_df.drop('sex', axis=1))
pca_df = pd.DataFrame(data=x, columns=[f'PCA_{i}' for i in range(x.shape[1])])
pca_df['sex'] = body_df['sex']
fig = px.scatter(pca_df, x='PCA_0', y='PCA_1', color='sex')
fig.show()

In [221]:
# Part 4 Task 4
x = body_df.drop('sex', axis=1)
y = body_df['sex']
umap_model = umap.UMAP()
umap_model.fit(x, y)
umap_data = umap_model.transform(x)
umap_df = pd.DataFrame(
    data=umap_data, columns=['UMAP_1', 'UMAP_2']
)
umap_df['sex'] = y


In [222]:
# Part 4 Task 5
fig = px.scatter(umap_df, x='UMAP_1', y='UMAP_2', color='sex')
fig.show()