# Feature Engineering

In [47]:
import pandas as pd

In [48]:
# Load the sumo data from its CSV file into a DataFrame
sumo_df = pd.read_csv(filepath_or_buffer='../data/sumo_df.csv')

In [49]:
# Number of rows and columns in the raw data
sumo_df.shape

(645, 11)

In [50]:
# Preview the first five rows
sumo_df.head(n=5)

Unnamed: 0,id,ring_name,current_rank,heya,birthday,height,weight,debut,total_wins,total_losses,total_matches
0,218,Dairaido,Sandanme 27 East,Takadagawa,1980-04-17T00:00:00Z,177.0,151.0,199603,619,580,1199
1,299,Kayatoiwa,Sandanme 70 East,Minato,1991-07-13T00:00:00Z,176.0,103.5,200703,337,347,684
2,36,Myogiryu,Juryo 8 West,Sakaigawa,1990-02-27T15:00:00Z,188.0,154.0,200905,601,597,1198
3,275,Oshozan,Sandanme 48 West,Naruto,2000-05-11T00:00:00Z,168.0,123.0,201603,164,153,317
4,614,Daitengu,Jonidan 67 East,Takadagawa,2004-06-17T00:00:00Z,177.0,93.0,202301,28,35,63


### Checking For Null Values

In [51]:
# Count missing values per column.  heya, birthday, height and weight each have
# exactly one - possibly all from the same rikishi
sumo_df.isna().sum(axis=0)

id               0
ring_name        0
current_rank     0
heya             1
birthday         1
height           1
weight           1
debut            0
total_wins       0
total_losses     0
total_matches    0
dtype: int64

In [52]:
# Show every row that has at least one missing value.
# A single rikishi (apparently a newcomer) accounts for all four NaNs, so that row will be dropped
sumo_df.loc[sumo_df.isnull().any(axis='columns')]

Unnamed: 0,id,ring_name,current_rank,heya,birthday,height,weight,debut,total_wins,total_losses,total_matches
591,8889,Anhibiki,Sandanme 62 East,,,,,202401,21,7,28


In [53]:
# Drop every row containing a NaN - only the single incomplete row is removed.
# The result is reassigned instead of using inplace=True, following recommended pandas style.
sumo_df = sumo_df.dropna()

### Ring Names

#### Interesting that there are multiple entries for the same ring name, even though they are supposed to be unique

In [57]:
# Tally how often each ring name occurs; counts above 1 reveal repeated names
sumo_df.loc[:, 'ring_name'].value_counts()

ring_name
Togyokuko     2
Daishinkai    2
Shunrai       1
Tamatensho    1
Tsuyasato     1
             ..
Kaorufuji     1
Daishomaru    1
Sakurai       1
Tokitora      1
Kazuma        1
Name: count, Length: 642, dtype: int64

In [58]:
# The two 'Togyokuko' rows appear to be duplicate entries for one rikishi
# (same heya, birthday and height).
# Names, current heights, and current weights were verified on sumodb.com
sumo_df.loc[sumo_df['ring_name'].eq('Togyokuko')]

Unnamed: 0,id,ring_name,current_rank,heya,birthday,height,weight,debut,total_wins,total_losses,total_matches
564,8861,Togyokuko,Jonidan 78 West,Tamanoi,2007-08-29T00:00:00Z,172.0,114.0,202305,22,30,52
588,8886,Togyokuko,Jonidan 86 East,Tamanoi,2007-08-29T00:00:00Z,172.0,119.3,202311,0,0,0


In [59]:
# Of the two 'Daishinkai' rows, row 179 is the more accurate one:
# its weight, height, and birthday were verified via the sumo association
sumo_df.loc[sumo_df['ring_name'].eq('Daishinkai')]

Unnamed: 0,id,ring_name,current_rank,heya,birthday,height,weight,debut,total_wins,total_losses,total_matches
15,8175,Daishinkai,Jonidan 30 East,Otake,1992-12-24T15:00:00Z,180.0,167.0,201903,41,65,106
179,217,Daishinkai,Sandanme 22 West,Otake,2000-04-25T00:00:00Z,174.0,135.0,201903,103,93,196


In [60]:
# Remove the two duplicate rows identified above by their unique id
# (8886 = extra Togyokuko entry, 8175 = less accurate Daishinkai entry).
# The tilde negates the isin() mask, so every row except those two is kept.
# .copy() makes the result an independent frame, so adding columns to it later
# cannot raise SettingWithCopyWarning.
sumo_df = sumo_df[~sumo_df['id'].isin([8886, 8175])].copy()

### Rank Categories

#### To simplify the ranking system, which includes numerical and east/west rankings within the categories, I'll only consider the main categories.  There are ten ranks total. There is a fixed number of rikishi allowed above rank Jonidan.  This includes:

#### Sandanme: 200
#### Makushita: 120
#### Juryo: 28
#### Maegashira: 32
#### Komusubi: At least two
#### Sekiwake: At least two
#### Ozeki: At least two
#### Yokozuna: Top position - No limit - 73 total rikishi have reached this rank in history of sumo



#### There shouldn't be more than one entry for each full ranking (except for Banzuke-gai), but the dataset has up to three entries for some of the ranks.  Ranks change every other month after a tournament, so this information changes frequently, especially within lower ranks.  This feature will not be used for modeling, but it may be interesting for visualizing the higher ranks.

In [109]:
# Frequency of each full rank string, most common first (the default ordering)
sumo_df['current_rank'].value_counts()

current_rank
Banzuke-gai          9
Jonokuchi 14 East    3
Jonokuchi 2 West     3
Jonokuchi 18 East    2
Sandanme 22 West     2
                    ..
Jonidan 46 East      1
Makushita 36 West    1
Sandanme 61 East     1
Makushita 8 East     1
Jonokuchi 16 East    1
Name: count, Length: 601, dtype: int64

In [110]:
# Filter to rows whose rank contains 'Yoko' - there is only one active Yokozuna currently
# NOTE(review): the saved output shows a rank_category column that is only created in a
# later cell, so this cell was last executed out of order
sumo_df.loc[sumo_df['current_rank'].str.contains('Yoko')]

Unnamed: 0,id,ring_name,current_rank,heya,birthday,height,weight,debut,total_wins,total_losses,total_matches,rank_category
408,45,Terunofuji,Yokozuna 1 East,Isegahama,1993-06-26T15:00:00Z,192.0,176.0,201101,521,272,793,Yokozuna


In [95]:
# Derive the broad rank category by keeping only the first word of current_rank
# For example, 'Sandanme 27 East' becomes 'Sandanme' and 'Banzuke-gai' stays 'Banzuke-gai'
# The vectorized .str accessor replaces the per-row lambda and passes missing values through as NaN
sumo_df['rank_category'] = sumo_df['current_rank'].str.split(' ').str[0]

In [96]:
# Preview the first five rows to confirm the new rank_category column
sumo_df.head(n=5)

Unnamed: 0,id,ring_name,current_rank,heya,birthday,height,weight,debut,total_wins,total_losses,total_matches,rank_category
0,218,Dairaido,Sandanme 27 East,Takadagawa,1980-04-17T00:00:00Z,177.0,151.0,199603,619,580,1199,Sandanme
1,299,Kayatoiwa,Sandanme 70 East,Minato,1991-07-13T00:00:00Z,176.0,103.5,200703,337,347,684,Sandanme
2,36,Myogiryu,Juryo 8 West,Sakaigawa,1990-02-27T15:00:00Z,188.0,154.0,200905,601,597,1198,Juryo
3,275,Oshozan,Sandanme 48 West,Naruto,2000-05-11T00:00:00Z,168.0,123.0,201603,164,153,317,Sandanme
4,614,Daitengu,Jonidan 67 East,Takadagawa,2004-06-17T00:00:00Z,177.0,93.0,202301,28,35,63,Jonidan


### Heya

In [111]:
# Number of rikishi in each heya, largest first
sumo_df.loc[:, 'heya'].value_counts()

heya
Isegahama      27
Kokonoe        25
Nishonoseki    25
Sadogatake     24
Oitekaze       23
Takasago       22
Sakaigawa      21
Kise           21
Tamanoi        20
Tokitsukaze    19
Takadagawa     19
Dewanoumi      18
Tatsunami      18
Hakkaku        17
Shikoroyama    17
Naruto         17
Kasugano       16
Onomatsu       16
Musashigawa    16
Futagoyama     16
Isenoumi       15
Arashio        15
Shikihide      15
Fujishima      14
Onoe           14
Miyagino       14
Yamahibiki     13
Otake          13
Tagonoura      13
Asakayama      12
Nishiiwa       11
Hanaregoma     10
Tokiwayama      9
Oshima          9
Takekuma        8
Minato          8
Shibatayama     8
Asahiyama       7
Michinoku       7
Oshiogawa       7
Irumagawa       6
Ajigawa         5
Kataonami       4
Nishikido       4
Otowayama       3
Ikazuchi        1
Name: count, dtype: int64