In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LogisticRegression 
from sklearn.model_selection import train_test_split


In [2]:
# reading csv file
athlete_events = pd.read_csv('../CSV for ML models/athlete_events.csv')

In [3]:
athlete_events.head()

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
0,1,A Dijiang,M,24.0,180.0,80.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,
1,2,A Lamusi,M,23.0,170.0,60.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,
2,3,Gunnar Nielsen Aaby,M,24.0,,,Denmark,DEN,1920 Summer,1920,Summer,Antwerpen,Football,Football Men's Football,
3,4,Edgar Lindenau Aabye,M,34.0,,,Denmark/Sweden,DEN,1900 Summer,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold
4,5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,Netherlands,NED,1988 Winter,1988,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,


In [4]:
filter_data = athlete_events[["Sex", "Age", "Height", "Weight", "Team", "Year", "Season", "Sport", "Event", "Medal"]]

In [5]:
filter_data.head()

Unnamed: 0,Sex,Age,Height,Weight,Team,Year,Season,Sport,Event,Medal
0,M,24.0,180.0,80.0,China,1992,Summer,Basketball,Basketball Men's Basketball,
1,M,23.0,170.0,60.0,China,2012,Summer,Judo,Judo Men's Extra-Lightweight,
2,M,24.0,,,Denmark,1920,Summer,Football,Football Men's Football,
3,M,34.0,,,Denmark/Sweden,1900,Summer,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold
4,F,21.0,185.0,82.0,Netherlands,1988,Winter,Speed Skating,Speed Skating Women's 500 metres,


In [6]:
# get the data for winter Olympic
winter_data = filter_data[filter_data["Season"] == "Winter"]
winter_data.head()

Unnamed: 0,Sex,Age,Height,Weight,Team,Year,Season,Sport,Event,Medal
4,F,21.0,185.0,82.0,Netherlands,1988,Winter,Speed Skating,Speed Skating Women's 500 metres,
5,F,21.0,185.0,82.0,Netherlands,1988,Winter,Speed Skating,"Speed Skating Women's 1,000 metres",
6,F,25.0,185.0,82.0,Netherlands,1992,Winter,Speed Skating,Speed Skating Women's 500 metres,
7,F,25.0,185.0,82.0,Netherlands,1992,Winter,Speed Skating,"Speed Skating Women's 1,000 metres",
8,F,27.0,185.0,82.0,Netherlands,1994,Winter,Speed Skating,Speed Skating Women's 500 metres,


In [7]:
# get the data for summer Olympic
summer_data = filter_data[(filter_data["Season"] == "Summer")]
summer_data.head()



Unnamed: 0,Sex,Age,Height,Weight,Team,Year,Season,Sport,Event,Medal
0,M,24.0,180.0,80.0,China,1992,Summer,Basketball,Basketball Men's Basketball,
1,M,23.0,170.0,60.0,China,2012,Summer,Judo,Judo Men's Extra-Lightweight,
2,M,24.0,,,Denmark,1920,Summer,Football,Football Men's Football,
3,M,34.0,,,Denmark/Sweden,1900,Summer,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold
26,F,18.0,168.0,,Netherlands,1932,Summer,Athletics,Athletics Women's 100 metres,


In [8]:
# print out every sport in the summer Olympic
summer_data["Sport"].unique()

array(['Basketball', 'Judo', 'Football', 'Tug-Of-War', 'Athletics',
       'Swimming', 'Badminton', 'Sailing', 'Gymnastics',
       'Art Competitions', 'Handball', 'Weightlifting', 'Wrestling',
       'Water Polo', 'Hockey', 'Rowing', 'Fencing', 'Equestrianism',
       'Shooting', 'Boxing', 'Taekwondo', 'Cycling', 'Diving', 'Canoeing',
       'Tennis', 'Modern Pentathlon', 'Golf', 'Softball', 'Archery',
       'Volleyball', 'Synchronized Swimming', 'Table Tennis', 'Baseball',
       'Rhythmic Gymnastics', 'Rugby Sevens', 'Trampolining',
       'Beach Volleyball', 'Triathlon', 'Rugby', 'Lacrosse', 'Polo',
       'Cricket', 'Ice Hockey', 'Racquets', 'Motorboating', 'Croquet',
       'Figure Skating', 'Jeu De Paume', 'Roque', 'Basque Pelota',
       'Alpinism', 'Aeronautics'], dtype=object)

In [9]:
summer_sports = summer_data["Sport"].unique()

In [10]:
# For loop to train the Logistic Regression model and get the testing score for male athletes in every Summer Olympic Sport

for sport in summer_sports:
    try:
        athlete_m = summer_data[(summer_data["Sport"] == f'{sport}') & (summer_data["Sex"] == "M")]
        athlete_m = athlete_m[["Age", "Height", "Weight", "Medal"]]
        athlete_m = athlete_m.dropna(subset=['Height', 'Weight']).reset_index(drop = True)
        
        # One-hot encoding                
        athlete_m = athlete_m.replace("Gold", 1)
        athlete_m = athlete_m.replace("Silver", 1)
        athlete_m = athlete_m.replace("Bronze", 1)
        athlete_m["Medal"] = athlete_m["Medal"].fillna(0)
        athlete_m = athlete_m.dropna()
        athlete_m["Medal"].unique()
        
        # Assign X (data) and y (target)        
        X1 = athlete_m[['Height', "Weight", "Age"]] 
        print(X1.shape) 
        y1 = athlete_m['Medal'] 
        print(y1.shape)
        print(sport)

        # Split our data into training and testing
        X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, random_state=42)
        
        # Create a Logistic Regression Model
        classifier = LogisticRegression() 
        
        #Fit (train) or model using the training data
        classifier.fit(X1_train, y1_train)
        
        #Validate the model using the test data
        print(f"Training Data Score: {classifier.score(X1_train, y1_train)}") 
        print(f"Testing Data Score: {classifier.score(X1_test, y1_test)}")
        
        # Predict the testing data point
        predictions = classifier.predict(X1_test)

        pd.DataFrame({"Prediction": predictions, "Actual": y1_test})
        print("----------------------------------------------------")
    except:
        print("An exception occurred")
        print("----------------------------------------------------")

  athlete_m = athlete_m.replace("Bronze", 1)
  athlete_m = athlete_m.replace("Bronze", 1)
  athlete_m = athlete_m.replace("Bronze", 1)
  athlete_m = athlete_m.replace("Bronze", 1)
  athlete_m = athlete_m.replace("Bronze", 1)


(2461, 3)
(2461,)
Basketball
Training Data Score: 0.7550135501355013
Testing Data Score: 0.737012987012987
----------------------------------------------------
(2369, 3)
(2369,)
Judo
Training Data Score: 0.8552927927927928
Testing Data Score: 0.8600337268128162
----------------------------------------------------
(3459, 3)
(3459,)
Football
Training Data Score: 0.776792598303778
Testing Data Score: 0.7641618497109827
----------------------------------------------------
(20, 3)
(20,)
Tug-Of-War
Training Data Score: 0.8666666666666667
Testing Data Score: 0.6
----------------------------------------------------
(21391, 3)
(21391,)
Athletics
Training Data Score: 0.8901701676743751
Testing Data Score: 0.8790201944652206
----------------------------------------------------
(10321, 3)
(10321,)
Swimming
Training Data Score: 0.8717054263565891
Testing Data Score: 0.8736923672994963
----------------------------------------------------
(702, 3)
(702,)
Badminton
Training Data Score: 0.8821292775665

  athlete_m = athlete_m.replace("Bronze", 1)
  athlete_m = athlete_m.replace("Bronze", 1)
  athlete_m = athlete_m.replace("Bronze", 1)
  athlete_m = athlete_m.replace("Bronze", 1)
  athlete_m = athlete_m.replace("Bronze", 1)
  athlete_m = athlete_m.replace("Bronze", 1)


(10696, 3)
(10696,)
Gymnastics
Training Data Score: 0.9396659187235104
Testing Data Score: 0.9367988032909499
----------------------------------------------------
(28, 3)
(28,)
Art Competitions
Training Data Score: 0.9047619047619048
Testing Data Score: 1.0
----------------------------------------------------
(2036, 3)
(2036,)
Handball
Training Data Score: 0.7550753110674525
Testing Data Score: 0.724950884086444
----------------------------------------------------


  athlete_m = athlete_m.replace("Bronze", 1)
  athlete_m = athlete_m.replace("Bronze", 1)
  athlete_m = athlete_m.replace("Bronze", 1)
  athlete_m = athlete_m.replace("Bronze", 1)


(2514, 3)
(2514,)
Weightlifting
Training Data Score: 0.830238726790451
Testing Data Score: 0.8298887122416534
----------------------------------------------------
(4883, 3)
(4883,)
Wrestling
Training Data Score: 0.8156744948115784
Testing Data Score: 0.8165438165438166
----------------------------------------------------
(2231, 3)
(2231,)
Water Polo
Training Data Score: 0.7423789599521817
Testing Data Score: 0.7455197132616488
----------------------------------------------------
(2874, 3)
(2874,)
Hockey
Training Data Score: 0.748491879350348
Testing Data Score: 0.760778859527121
----------------------------------------------------
(5667, 3)
(5667,)
Rowing
Training Data Score: 0.7534117647058823
Testing Data Score: 0.7544107268877911
----------------------------------------------------
(4763, 3)
(4763,)
Fencing
Training Data Score: 0.8331466965285554
Testing Data Score: 0.8429890848026869
----------------------------------------------------
(3400, 3)
(3400,)
Equestrianism
Training Data 

  athlete_m = athlete_m.replace("Bronze", 1)
  athlete_m = athlete_m.replace("Bronze", 1)
  athlete_m = athlete_m.replace("Bronze", 1)
  athlete_m = athlete_m.replace("Bronze", 1)
  athlete_m = athlete_m.replace("Bronze", 1)
  athlete_m = athlete_m.replace("Bronze", 1)
  athlete_m = athlete_m.replace("Bronze", 1)


(5596, 3)
(5596,)
Shooting
Training Data Score: 0.9306647605432452
Testing Data Score: 0.9242315939957112
----------------------------------------------------
(4304, 3)
(4304,)
Boxing
Training Data Score: 0.8379801734820322
Testing Data Score: 0.8494423791821561
----------------------------------------------------
(306, 3)
(306,)
Taekwondo
Training Data Score: 0.759825327510917
Testing Data Score: 0.7662337662337663
----------------------------------------------------
(6437, 3)
(6437,)
Cycling
Training Data Score: 0.9011808576755749
Testing Data Score: 0.8807453416149068
----------------------------------------------------
(1043, 3)
(1043,)
Diving
Training Data Score: 0.8465473145780051
Testing Data Score: 0.8582375478927203
----------------------------------------------------
(4270, 3)
(4270,)
Canoeing
Training Data Score: 0.8251093066833229
Testing Data Score: 0.8033707865168539
----------------------------------------------------
(1044, 3)
(1044,)
Tennis
Training Data Score: 0.92464

  athlete_m = athlete_m.replace("Bronze", 1)
  athlete_m = athlete_m.replace("Bronze", 1)
  athlete_m = athlete_m.replace("Bronze", 1)
  athlete_m = athlete_m.replace("Bronze", 1)
  athlete_m = athlete_m.replace("Bronze", 1)
  athlete_m = athlete_m.replace("Bronze", 1)
  athlete_m = athlete_m.replace("Bronze", 1)


(1095, 3)
(1095,)
Modern Pentathlon
Training Data Score: 0.8745432399512789
Testing Data Score: 0.8722627737226277
----------------------------------------------------
(53, 3)
(53,)
Golf
Training Data Score: 1.0
Testing Data Score: 0.8571428571428571
----------------------------------------------------
(0, 3)
(0,)
Softball
An exception occurred
----------------------------------------------------
(1000, 3)
(1000,)
Archery
Training Data Score: 0.892
Testing Data Score: 0.9
----------------------------------------------------
(1782, 3)
(1782,)
Volleyball
Training Data Score: 0.7357784431137725
Testing Data Score: 0.679372197309417
----------------------------------------------------
(0, 3)
(0,)
Synchronized Swimming
An exception occurred
----------------------------------------------------
(927, 3)
(927,)
Table Tennis
Training Data Score: 0.9064748201438849
Testing Data Score: 0.9224137931034483
----------------------------------------------------
(846, 3)
(846,)
Baseball
Training Data S

  athlete_m = athlete_m.replace("Bronze", 1)
  athlete_m = athlete_m.replace("Bronze", 1)
  athlete_m = athlete_m.replace("Bronze", 1)
  athlete_m = athlete_m.replace("Bronze", 1)


(74, 3)
(74,)
Trampolining
Training Data Score: 0.7454545454545455
Testing Data Score: 0.9473684210526315
----------------------------------------------------
(273, 3)
(273,)
Beach Volleyball
Training Data Score: 0.8676470588235294
Testing Data Score: 0.9130434782608695
----------------------------------------------------
(265, 3)
(265,)
Triathlon
Training Data Score: 0.9545454545454546
Testing Data Score: 0.9104477611940298
----------------------------------------------------
(30, 3)
(30,)
Rugby
An exception occurred
----------------------------------------------------
(2, 3)
(2,)
Lacrosse
An exception occurred
----------------------------------------------------
(0, 3)
(0,)
Polo
An exception occurred
----------------------------------------------------
(0, 3)
(0,)
Cricket
An exception occurred
----------------------------------------------------


  athlete_m = athlete_m.replace("Bronze", 1)
  athlete_m = athlete_m.replace("Bronze", 1)
  athlete_m = athlete_m.replace("Silver", 1)
  athlete_m = athlete_m.replace("Gold", 1)


(18, 3)
(18,)
Ice Hockey
An exception occurred
----------------------------------------------------
(0, 3)
(0,)
Racquets
An exception occurred
----------------------------------------------------
(1, 3)
(1,)
Motorboating
An exception occurred
----------------------------------------------------
(0, 3)
(0,)
Croquet
An exception occurred
----------------------------------------------------
(1, 3)
(1,)
Figure Skating
An exception occurred
----------------------------------------------------
(0, 3)
(0,)
Jeu De Paume
An exception occurred
----------------------------------------------------
(0, 3)
(0,)
Roque
An exception occurred
----------------------------------------------------
(0, 3)
(0,)
Basque Pelota
An exception occurred
----------------------------------------------------
(0, 3)
(0,)
Alpinism
An exception occurred
----------------------------------------------------
(0, 3)
(0,)
Aeronautics
An exception occurred
----------------------------------------------------


  athlete_m = athlete_m.replace("Silver", 1)
  athlete_m["Medal"] = athlete_m["Medal"].fillna(0)
  athlete_m = athlete_m.replace("Gold", 1)


In [11]:
# For loop to train the Logistic Regression model and get the testing score for female athletes in every Summer Olympic Sport

for sport in summer_sports:
    try:
        athlete_f = summer_data[(summer_data["Sport"] == f'{sport}') & (summer_data["Sex"] == "F")]
        athlete_f = athlete_f[["Age", "Height", "Weight", "Medal"]]
        athlete_f = athlete_f.dropna(subset=['Height', 'Weight']).reset_index(drop = True)
        
        # One-hot encoding        
        athlete_f = athlete_f.replace("Gold", 1)
        athlete_f = athlete_f.replace("Silver", 1)
        athlete_f = athlete_f.replace("Bronze", 1)
        athlete_f["Medal"] = athlete_f["Medal"].fillna(0)
        athlete_f = athlete_f.dropna()
        athlete_f["Medal"].unique()
        
        # Assign X (data) and y (target)
        X = athlete_f[['Height', "Weight", "Age"]] 
        print(X.shape) 
        y = athlete_f['Medal'] 
        print(y.shape)
        print(sport)

        # Split our data into training and testing
        X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
        
        # Create a Logistic Regression Model
        classifier = LogisticRegression() 
        classifier
        
        # Fit (train) or model using the training data
        classifier.fit(X_train, y_train)
        
        # Validate the model using the test data
        print(f"Training Data Score: {classifier.score(X_train, y_train)}") 
        print(f"Testing Data Score: {classifier.score(X_test, y_test)}")
        
        # Predict the testing data point
        predictions = classifier.predict(X_test)

        pd.DataFrame({"Prediction": predictions, "Actual": y_test})
        print("----------------------------------------------------")
    except:
        print("An exception occurred")
        print("----------------------------------------------------")

  athlete_f = athlete_f.replace("Bronze", 1)
  athlete_f = athlete_f.replace("Bronze", 1)
  athlete_f = athlete_f.replace("Bronze", 1)
  athlete_f = athlete_f.replace("Bronze", 1)


(1207, 3)
(1207,)
Basketball
Training Data Score: 0.6740331491712708
Testing Data Score: 0.6920529801324503
----------------------------------------------------
(997, 3)
(997,)
Judo
Training Data Score: 0.8112449799196787
Testing Data Score: 0.812
----------------------------------------------------
(984, 3)
(984,)
Football
Training Data Score: 0.6869918699186992
Testing Data Score: 0.7235772357723578
----------------------------------------------------
(0, 3)
(0,)
Tug-Of-War
An exception occurred
----------------------------------------------------
(10983, 3)
(10983,)
Athletics
Training Data Score: 0.8908583222046862
Testing Data Score: 0.8761835396941005
----------------------------------------------------


  athlete_f = athlete_f.replace("Bronze", 1)
  athlete_f = athlete_f.replace("Bronze", 1)
  athlete_f = athlete_f.replace("Bronze", 1)
  athlete_f = athlete_f.replace("Bronze", 1)


(8455, 3)
(8455,)
Swimming
Training Data Score: 0.8667402617883615
Testing Data Score: 0.847682119205298
----------------------------------------------------
(660, 3)
(660,)
Badminton
Training Data Score: 0.8868686868686869
Testing Data Score: 0.896969696969697
----------------------------------------------------
(871, 3)
(871,)
Sailing
Training Data Score: 0.8683001531393568
Testing Data Score: 0.8211009174311926
----------------------------------------------------
(7575, 3)
(7575,)
Gymnastics
Training Data Score: 0.9295898609399753
Testing Data Score: 0.9429778247096093
----------------------------------------------------
(0, 3)
(0,)
Art Competitions
An exception occurred
----------------------------------------------------
(1283, 3)
(1283,)
Handball
Training Data Score: 0.6694386694386695
Testing Data Score: 0.6791277258566978
----------------------------------------------------
(460, 3)
(460,)
Weightlifting
Training Data Score: 0.7652173913043478
Testing Data Score: 0.7913043478260

  athlete_f = athlete_f.replace("Bronze", 1)
  athlete_f = athlete_f.replace("Bronze", 1)
  athlete_f = athlete_f.replace("Bronze", 1)
  athlete_f = athlete_f.replace("Bronze", 1)
  athlete_f = athlete_f.replace("Bronze", 1)


(488, 3)
(488,)
Water Polo
Training Data Score: 0.6038251366120219
Testing Data Score: 0.680327868852459
----------------------------------------------------
(1390, 3)
(1390,)
Hockey
Training Data Score: 0.6794625719769674
Testing Data Score: 0.6609195402298851
----------------------------------------------------
(2123, 3)
(2123,)
Rowing
Training Data Score: 0.660175879396985
Testing Data Score: 0.672316384180791
----------------------------------------------------
(1774, 3)
(1774,)
Fencing
Training Data Score: 0.8180451127819549
Testing Data Score: 0.8108108108108109
----------------------------------------------------
(1153, 3)
(1153,)
Equestrianism


  athlete_f = athlete_f.replace("Bronze", 1)
  athlete_f = athlete_f.replace("Bronze", 1)
  athlete_f = athlete_f.replace("Bronze", 1)
  athlete_f = athlete_f.replace("Bronze", 1)
  athlete_f = athlete_f.replace("Bronze", 1)


Training Data Score: 0.8263888888888888
Testing Data Score: 0.8546712802768166
----------------------------------------------------
(1664, 3)
(1664,)
Shooting
Training Data Score: 0.9126602564102564
Testing Data Score: 0.9134615384615384
----------------------------------------------------
(59, 3)
(59,)
Boxing
Training Data Score: 0.6363636363636364
Testing Data Score: 0.5333333333333333
----------------------------------------------------
(290, 3)
(290,)
Taekwondo
Training Data Score: 0.7373271889400922
Testing Data Score: 0.7945205479452054
----------------------------------------------------


  athlete_f = athlete_f.replace("Bronze", 1)
  athlete_f = athlete_f.replace("Bronze", 1)
  athlete_f = athlete_f.replace("Bronze", 1)
  athlete_f = athlete_f.replace("Bronze", 1)
  athlete_f = athlete_f.replace("Bronze", 1)


(1338, 3)
(1338,)
Cycling
Training Data Score: 0.8594217347956131
Testing Data Score: 0.8865671641791045
----------------------------------------------------
(968, 3)
(968,)
Diving
Training Data Score: 0.8195592286501377
Testing Data Score: 0.8760330578512396
----------------------------------------------------
(1280, 3)
(1280,)
Canoeing
Training Data Score: 0.7791666666666667
Testing Data Score: 0.815625
----------------------------------------------------
(964, 3)
(964,)
Tennis
Training Data Score: 0.9156293222683264
Testing Data Score: 0.9045643153526971
----------------------------------------------------
(164, 3)
(164,)
Modern Pentathlon


  athlete_f = athlete_f.replace("Bronze", 1)
  athlete_f = athlete_f.replace("Bronze", 1)
  athlete_f = athlete_f.replace("Bronze", 1)
  athlete_f = athlete_f.replace("Bronze", 1)
  athlete_f = athlete_f.replace("Bronze", 1)


Training Data Score: 0.9105691056910569
Testing Data Score: 0.9024390243902439
----------------------------------------------------
(55, 3)
(55,)
Golf
Training Data Score: 0.975609756097561
Testing Data Score: 0.9285714285714286
----------------------------------------------------
(441, 3)
(441,)
Softball
Training Data Score: 0.6090909090909091
Testing Data Score: 0.6936936936936937
----------------------------------------------------
(883, 3)
(883,)
Archery
Training Data Score: 0.8867069486404834
Testing Data Score: 0.8868778280542986
----------------------------------------------------
(1495, 3)
(1495,)
Volleyball
Training Data Score: 0.6726137377341659
Testing Data Score: 0.7058823529411765
----------------------------------------------------


  athlete_f = athlete_f.replace("Bronze", 1)
  athlete_f = athlete_f.replace("Bronze", 1)
  athlete_f = athlete_f.replace("Bronze", 1)


(849, 3)
(849,)
Synchronized Swimming
Training Data Score: 0.7531446540880503
Testing Data Score: 0.7183098591549296
----------------------------------------------------
(862, 3)
(862,)
Table Tennis
Training Data Score: 0.9040247678018576
Testing Data Score: 0.9166666666666666
----------------------------------------------------
(0, 3)
(0,)
Baseball
An exception occurred
----------------------------------------------------
(615, 3)
(615,)
Rhythmic Gymnastics
Training Data Score: 0.7982646420824295
Testing Data Score: 0.8181818181818182
----------------------------------------------------


  athlete_f = athlete_f.replace("Bronze", 1)
  athlete_f = athlete_f.replace("Bronze", 1)
  athlete_f = athlete_f.replace("Bronze", 1)
  athlete_f = athlete_f.replace("Bronze", 1)


(146, 3)
(146,)
Rugby Sevens
Training Data Score: 0.7798165137614679
Testing Data Score: 0.6756756756756757
----------------------------------------------------
(75, 3)
(75,)
Trampolining
Training Data Score: 0.7857142857142857
Testing Data Score: 0.8947368421052632
----------------------------------------------------
(265, 3)
(265,)
Beach Volleyball
Training Data Score: 0.8838383838383839
Testing Data Score: 0.8208955223880597
----------------------------------------------------
(261, 3)
(261,)
Triathlon
Training Data Score: 0.9384615384615385
Testing Data Score: 0.9545454545454546
----------------------------------------------------
(0, 3)
(0,)
Rugby
An exception occurred
----------------------------------------------------
(0, 3)
(0,)
Lacrosse
An exception occurred
----------------------------------------------------
(0, 3)
(0,)
Polo
An exception occurred
----------------------------------------------------
(0, 3)
(0,)
Cricket
An exception occurred
----------------------------------

  athlete_f = athlete_f.replace("Gold", 1)


In [12]:
# print out every sport in the winter Olympic
winter_data['Sport'].unique()

array(['Speed Skating', 'Cross Country Skiing', 'Ice Hockey', 'Biathlon',
       'Alpine Skiing', 'Luge', 'Bobsleigh', 'Figure Skating',
       'Nordic Combined', 'Freestyle Skiing', 'Ski Jumping', 'Curling',
       'Snowboarding', 'Short Track Speed Skating', 'Skeleton',
       'Military Ski Patrol', 'Alpinism'], dtype=object)

In [13]:
winter_sports = winter_data['Sport'].unique()

In [14]:
# For loop to train the Logistic Regression model and get the testing score for male athletes in every Winter Olympic Sport
for sport in winter_sports:
    try:
        athlete_m = winter_data[(winter_data["Sport"] == f'{sport}') & (winter_data["Sex"] == "M")]
        athlete_m = athlete_m[["Age", "Height", "Weight", "Medal"]]
        athlete_m = athlete_m.dropna(subset=['Height', 'Weight']).reset_index(drop = True)
        
        # One-hot encoding
        athlete_m = athlete_m.replace("Gold", 1)
        athlete_m = athlete_m.replace("Silver", 1)
        athlete_m = athlete_m.replace("Bronze", 1)
        athlete_m["Medal"] = athlete_m["Medal"].fillna(0)
        athlete_m = athlete_m.dropna()
        athlete_m["Medal"].unique()
        
        # Assign X (data) and y (target)
        X = athlete_m[['Height', "Weight", "Age"]] 
        print(X.shape) 
        y = athlete_m['Medal'] 
        print(y.shape)
        print(sport)
        
        # Split our data into training and testing
        X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
        
        # Create a Logistic Regression Model
        classifier = LogisticRegression() 
        classifier
        
        # Fit (train) or model using the training data
        classifier.fit(X_train, y_train)
        
        # Validate the model using the test data
        print(f"Training Data Score: {classifier.score(X_train, y_train)}") 
        print(f"Testing Data Score: {classifier.score(X_test, y_test)}")
        
        # Predict the testing data point
        #predictions = classifier.predict(X_test)

        #pd.DataFrame({"Prediction": predictions, "Actual": y_test})
        print("----------------------------------------------------")
    except:
        print("An exception occurred")
        print("----------------------------------------------------")

  athlete_m = athlete_m.replace("Bronze", 1)
  athlete_m = athlete_m.replace("Bronze", 1)
  athlete_m = athlete_m.replace("Bronze", 1)
  athlete_m = athlete_m.replace("Bronze", 1)
  athlete_m = athlete_m.replace("Bronze", 1)
  athlete_m = athlete_m.replace("Bronze", 1)


(2459, 3)
(2459,)
Speed Skating
Training Data Score: 0.8969631236442517
Testing Data Score: 0.8991869918699187
----------------------------------------------------
(4443, 3)
(4443,)
Cross Country Skiing
Training Data Score: 0.9171668667466987
Testing Data Score: 0.9144914491449145
----------------------------------------------------
(3807, 3)
(3807,)
Ice Hockey
Training Data Score: 0.7348511383537654
Testing Data Score: 0.7626050420168067
----------------------------------------------------
(2808, 3)
(2808,)
Biathlon
Training Data Score: 0.9121557454890789
Testing Data Score: 0.9202279202279202
----------------------------------------------------
(3718, 3)
(3718,)
Alpine Skiing
Training Data Score: 0.9526542324246772
Testing Data Score: 0.9548387096774194
----------------------------------------------------
(1026, 3)
(1026,)
Luge
Training Data Score: 0.8621586475942783
Testing Data Score: 0.8949416342412452
----------------------------------------------------
(2062, 3)
(2062,)
Bobsleig

  athlete_m = athlete_m.replace("Bronze", 1)
  athlete_m = athlete_m.replace("Bronze", 1)
  athlete_m = athlete_m.replace("Bronze", 1)
  athlete_m = athlete_m.replace("Bronze", 1)
  athlete_m = athlete_m.replace("Bronze", 1)
  athlete_m = athlete_m.replace("Bronze", 1)


Training Data Score: 0.6993464052287581
Testing Data Score: 0.6274509803921569
----------------------------------------------------
(512, 3)
(512,)
Snowboarding
Training Data Score: 0.9296875
Testing Data Score: 0.8671875
----------------------------------------------------
(749, 3)
(749,)
Short Track Speed Skating
Training Data Score: 0.8235294117647058
Testing Data Score: 0.7925531914893617
----------------------------------------------------
(109, 3)
(109,)
Skeleton
Training Data Score: 0.8518518518518519
Testing Data Score: 1.0
----------------------------------------------------
(0, 3)
(0,)
Military Ski Patrol
An exception occurred
----------------------------------------------------
(0, 3)
(0,)
Alpinism
An exception occurred
----------------------------------------------------


  athlete_m = athlete_m.replace("Bronze", 1)
  athlete_m = athlete_m.replace("Bronze", 1)
  athlete_m = athlete_m.replace("Bronze", 1)


In [15]:
# For loop to train the Logistic Regression model and get the testing score for female athletes in every Winter Olympic Sport
for sport in winter_sports:
    try:
        athlete_f = winter_data[(winter_data["Sport"] == f'{sport}') & (winter_data["Sex"] == "F")]
        athlete_f = athlete_f[["Age", "Height", "Weight", "Medal"]]
        athlete_f = athlete_f.dropna(subset=['Height', 'Weight']).reset_index(drop = True)
        
        # One-hot encoding
        athlete_f = athlete_f.replace("Gold", 1)
        athlete_f = athlete_f.replace("Silver", 1)
        athlete_f = athlete_f.replace("Bronze", 1)
        athlete_f["Medal"] = athlete_f["Medal"].fillna(0)
        athlete_f = athlete_f.dropna()
        athlete_f["Medal"].unique()
        
        # Assign X (data) and y (target)
        X = athlete_f[['Height', "Weight", "Age"]] 
        print(X.shape) 
        y = athlete_f['Medal'] 
        print(y.shape)
        print(sport)
        
        # Split our data into training and testing

        X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
        
        # Create a Logistic Regression Model
        classifier = LogisticRegression() 
        classifier
        
        # Fit (train) or model using the training data
        classifier.fit(X_train, y_train)
        
        # Validate the model using the test data
        print(f"Training Data Score: {classifier.score(X_train, y_train)}") 
        print(f"Testing Data Score: {classifier.score(X_test, y_test)}")
        
        # Predict the testing data point

        #predictions = classifier.predict(X_test)

        #pd.DataFrame({"Prediction": predictions, "Actual": y_test})
        print("----------------------------------------------------")
    except:
        print("An exception occurred")
        print("----------------------------------------------------")

(1941, 3)
(1941,)
Speed Skating
Training Data Score: 0.8879725085910652
Testing Data Score: 0.8497942386831275
----------------------------------------------------
(3086, 3)
(3086,)
Cross Country Skiing
Training Data Score: 0.898876404494382
Testing Data Score: 0.8937823834196891
----------------------------------------------------
(754, 3)
(754,)
Ice Hockey


  athlete_f = athlete_f.replace("Bronze", 1)
  athlete_f = athlete_f.replace("Bronze", 1)
  athlete_f = athlete_f.replace("Bronze", 1)
  athlete_f = athlete_f.replace("Bronze", 1)
  athlete_f = athlete_f.replace("Bronze", 1)
  athlete_f = athlete_f.replace("Bronze", 1)


Training Data Score: 0.6584070796460177
Testing Data Score: 0.6137566137566137
----------------------------------------------------
(1807, 3)
(1807,)
Biathlon
Training Data Score: 0.922509225092251
Testing Data Score: 0.9070796460176991
----------------------------------------------------
(2604, 3)
(2604,)
Alpine Skiing
Training Data Score: 0.9272913466461854
Testing Data Score: 0.9416282642089093
----------------------------------------------------
(351, 3)
(351,)
Luge
Training Data Score: 0.9049429657794676
Testing Data Score: 0.7727272727272727
----------------------------------------------------
(143, 3)
(143,)
Bobsleigh
Training Data Score: 0.8317757009345794
Testing Data Score: 0.8333333333333334
----------------------------------------------------
(769, 3)
(769,)
Figure Skating
Training Data Score: 0.828125
Testing Data Score: 0.8652849740932642
----------------------------------------------------
(0, 3)
(0,)
Nordic Combined
An exception occurred
--------------------------------

  athlete_f = athlete_f.replace("Bronze", 1)
  athlete_f = athlete_f.replace("Bronze", 1)


(427, 3)
(427,)
Freestyle Skiing
Training Data Score: 0.88125
Testing Data Score: 0.8785046728971962
----------------------------------------------------
(26, 3)
(26,)
Ski Jumping
Training Data Score: 1.0
Testing Data Score: 0.8571428571428571
----------------------------------------------------
(206, 3)
(206,)
Curling
Training Data Score: 0.6883116883116883
Testing Data Score: 0.5961538461538461
----------------------------------------------------
(412, 3)
(412,)
Snowboarding
Training Data Score: 0.8964401294498382
Testing Data Score: 0.8737864077669902
----------------------------------------------------


  athlete_f = athlete_f.replace("Bronze", 1)
  athlete_f = athlete_f.replace("Bronze", 1)
  athlete_f = athlete_f.replace("Bronze", 1)
  athlete_f = athlete_f.replace("Bronze", 1)
  athlete_f = athlete_f.replace("Bronze", 1)
  athlete_f = athlete_f.replace("Bronze", 1)


(739, 3)
(739,)
Short Track Speed Skating
Training Data Score: 0.8267148014440433
Testing Data Score: 0.7837837837837838
----------------------------------------------------
(65, 3)
(65,)
Skeleton
Training Data Score: 0.8333333333333334
Testing Data Score: 0.7647058823529411
----------------------------------------------------
(0, 3)
(0,)
Military Ski Patrol
An exception occurred
----------------------------------------------------
(0, 3)
(0,)
Alpinism
An exception occurred
----------------------------------------------------


## Logistic Regression for male athletes in Gymnastics

In [16]:
Gymnastics_M = summer_data[(summer_data["Sport"] == "Gymnastics") & (summer_data["Sex"] == "M")]

In [17]:
Gymnastics_M = Gymnastics_M[["Age", "Height", "Weight", "Medal"]]
Gymnastics_M = Gymnastics_M.dropna(subset=['Height', 'Weight']).reset_index(drop = True)

In [18]:
Gymnastics_M = Gymnastics_M.replace("Gold", 1)
Gymnastics_M = Gymnastics_M.replace("Silver", 1)
Gymnastics_M = Gymnastics_M.replace("Bronze", 1)
Gymnastics_M["Medal"] = Gymnastics_M["Medal"].fillna(0)
Gymnastics_M = Gymnastics_M.dropna()

  Gymnastics_M = Gymnastics_M.replace("Bronze", 1)


In [19]:
# Visualizing athlete with medal vs athlete without medal

from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure(1, figsize=(5, 5))
axes = Axes3D(fig, elev=20, azim=45)
axes.scatter(Gymnastics_M['Age'], Gymnastics_M['Height'], Gymnastics_M['Weight'], c=Gymnastics_M['Medal'], cmap=plt.cm.get_cmap("Spectral"))
plt.show()

  axes.scatter(Gymnastics_M['Age'], Gymnastics_M['Height'], Gymnastics_M['Weight'], c=Gymnastics_M['Medal'], cmap=plt.cm.get_cmap("Spectral"))


<Figure size 500x500 with 0 Axes>

In [20]:
a = summer_data[(summer_data["Sport"] == "Gymnastics") & (summer_data["Sex"] == "M")]
a = a[["Age", "Height", "Weight", "Medal"]].dropna().reset_index(drop = True)


a = a.replace("Gold", 1)
a = a.replace("Silver", 2)
a = a.replace("Bronze", 3)

  a = a.replace("Bronze", 3)


In [21]:
# Visualizing Gold, Silver, Bronze medalists data

fig = plt.figure(1, figsize=(5, 5))
axes = Axes3D(fig, elev=20, azim=45)
axes.scatter(a['Age'], a['Height'], a['Weight'], c=a['Medal'], cmap=plt.cm.get_cmap("Spectral"))
plt.show()

  axes.scatter(a['Age'], a['Height'], a['Weight'], c=a['Medal'], cmap=plt.cm.get_cmap("Spectral"))


<Figure size 500x500 with 0 Axes>

In [22]:
Gymnastics_M["Medal"].unique()

array([1., 0.])

In [23]:
# Assign X (data) and y (target)

X = Gymnastics_M[['Height', "Weight", "Age"]] 
print(X.shape) 
y = Gymnastics_M['Medal'] 
print(y.shape)

(10696, 3)
(10696,)


In [24]:
# Split our data into training and testing

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [25]:
# Create a Logistic Regression Model
classifier = LogisticRegression() 
classifier

classifier.fit(X_train, y_train)

In [26]:
# Validate the model using the test data
print(f"Training Data Score: {classifier.score(X_train, y_train)}") 
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")


Training Data Score: 0.9396659187235104
Testing Data Score: 0.9367988032909499


In [27]:
# Predict the testing data point

predictions = classifier.predict(X_test)

pd.DataFrame({"Prediction": predictions, "Actual": y_test})

Unnamed: 0,Prediction,Actual
3712,0.0,0.0
6778,0.0,0.0
304,0.0,0.0
3078,0.0,0.0
1597,0.0,0.0
...,...,...
4806,0.0,1.0
10662,0.0,0.0
3294,0.0,0.0
5131,0.0,0.0
