# Star Wars Character Homeworld Prediction

## Importing required modules

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

## Getting the dataset

In [2]:
# Location of the raw character table, relative to the notebook's working directory.
DATASET_PATH = "data/proj65/star_wars_character_dataset.csv"

# Read the CSV into the working frame and preview its first rows.
df = pd.read_csv(DATASET_PATH)
df.head()

Unnamed: 0,name,height,mass,hair_color,skin_color,eye_color,birth_year,sex,gender,homeworld,species,films,vehicles,starships
0,Luke Skywalker,172.0,77.0,blond,fair,blue,19.0,male,masculine,Tatooine,Human,"The Empire Strikes Back, Revenge of the Sith, ...","Snowspeeder, Imperial Speeder Bike","X-wing, Imperial shuttle"
1,C-3PO,167.0,75.0,,gold,yellow,112.0,none,masculine,Tatooine,Droid,"The Empire Strikes Back, Attack of the Clones,...",,
2,R2-D2,96.0,32.0,,"white, blue",red,33.0,none,masculine,Naboo,Droid,"The Empire Strikes Back, Attack of the Clones,...",,
3,Darth Vader,202.0,136.0,none,white,yellow,41.9,male,masculine,Tatooine,Human,"The Empire Strikes Back, Revenge of the Sith, ...",,TIE Advanced x1
4,Leia Organa,150.0,49.0,brown,light,brown,19.0,female,feminine,Alderaan,Human,"The Empire Strikes Back, Revenge of the Sith, ...",Imperial Speeder Bike,


## Cleaning the dataset

In [3]:
# Number of missing entries in each column of the raw frame.
missing_before = df.isnull().sum()
missing_before

name           0
height         6
mass          28
hair_color     5
skin_color     0
eye_color      0
birth_year    44
sex            4
gender         4
homeworld     10
species        4
films          0
vehicles      76
starships     67
dtype: int64

In [4]:
# Impute the missing numeric measurements with the column mean (NaNs are
# skipped when the mean is computed).
# The result is assigned back to the column instead of calling
# `df[col].fillna(..., inplace=True)`: that form mutates an intermediate
# Series, is deprecated as chained assignment, and no longer updates `df`
# once pandas Copy-on-Write is active.
for column in ("height", "mass"):
    df[column] = df[column].fillna(df[column].mean())

In [5]:
# Show per-column dtypes and non-null counts to see which columns still have gaps.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87 entries, 0 to 86
Data columns (total 14 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   name        87 non-null     object 
 1   height      87 non-null     float64
 2   mass        87 non-null     float64
 3   hair_color  82 non-null     object 
 4   skin_color  87 non-null     object 
 5   eye_color   87 non-null     object 
 6   birth_year  43 non-null     float64
 7   sex         83 non-null     object 
 8   gender      83 non-null     object 
 9   homeworld   77 non-null     object 
 10  species     83 non-null     object 
 11  films       87 non-null     object 
 12  vehicles    11 non-null     object 
 13  starships   20 non-null     object 
dtypes: float64(3), object(11)
memory usage: 9.6+ KB


In [6]:
# Frequency of each hair_color label (missing entries are not counted).
hair_color_counts = df["hair_color"].value_counts()
hair_color_counts

none             37
brown            18
black            13
white             4
blond             3
brown, grey       1
auburn, white     1
auburn, grey      1
grey              1
auburn            1
blonde            1
unknown           1
Name: hair_color, dtype: int64

In [13]:
# Replacement value for every column that still contains missing entries.
# birth_year uses the mean of the observed values; the categorical columns
# use a fixed label.
# NOTE(review): "homeworld" is the prediction target further down; filling it
# with "none" turns the unknown rows into an extra class. Consider dropping
# those rows instead - confirm the intent.
fill_values = {
    "hair_color": "none",
    "birth_year": df["birth_year"].mean(),
    "sex": "male",
    "gender": "masculine",
    "homeworld": "none",
    "species": "Human",
    "vehicles": "none",
    "starships": "none",
}

# A single DataFrame.fillna with a column -> value mapping replaces the eight
# `df[col].fillna(..., inplace=True)` calls. Those operate on an intermediate
# Series (chained assignment), which is deprecated and does not modify `df`
# under pandas Copy-on-Write.
df = df.fillna(value=fill_values)

In [14]:
# Re-count missing entries per column after imputation.
missing_after = df.isnull().sum()
missing_after

name          0
height        0
mass          0
hair_color    0
skin_color    0
eye_color     0
birth_year    0
sex           0
gender        0
homeworld     0
species       0
films         0
vehicles      0
starships     0
dtype: int64

## Preprocessing the data

In [15]:
# Replace every object-dtype column with integer codes from a LabelEncoder.
# Each fitted encoder is kept in `encoders`, keyed by column name, so the
# codes can be mapped back to the original labels later, e.g.
# `encoders["homeworld"].inverse_transform(predictions)`. Previously the
# encoder was overwritten on every iteration and only the last one survived.
encoders = {}
for column in df.columns:
    if pd.api.types.is_object_dtype(df[column]):
        le = LabelEncoder()
        df[column] = le.fit_transform(df[column])
        encoders[column] = le
        # Print the label -> code ordering (position in classes_ is the code).
        print("-"*10)
        print(column)
        print(le.classes_)
        print("-"*10)

----------
name
['Ackbar' 'Adi Gallia' 'Anakin Skywalker' 'Arvel Crynyd' 'Ayla Secura'
 'BB8' 'Bail Prestor Organa' 'Barriss Offee' 'Ben Quadinaros'
 'Beru Whitesun lars' 'Bib Fortuna' 'Biggs Darklighter' 'Boba Fett'
 'Bossk' 'C-3PO' 'Captain Phasma' 'Chewbacca' 'Cliegg Lars' 'Cordé'
 'Darth Maul' 'Darth Vader' 'Dexter Jettster' 'Dooku' 'Dormé' 'Dud Bolt'
 'Eeth Koth' 'Finis Valorum' 'Finn' 'Gasgano' 'Greedo' 'Gregar Typho'
 'Grievous' 'Han Solo' 'IG-88' 'Jabba Desilijic Tiure' 'Jango Fett'
 'Jar Jar Binks' 'Jek Tono Porkins' 'Jocasta Nu' 'Ki-Adi-Mundi'
 'Kit Fisto' 'Lama Su' 'Lando Calrissian' 'Leia Organa' 'Lobot'
 'Luke Skywalker' 'Luminara Unduli' 'Mace Windu' 'Mas Amedda' 'Mon Mothma'
 'Nien Nunb' 'Nute Gunray' 'Obi-Wan Kenobi' 'Owen Lars' 'Padmé Amidala'
 'Palpatine' 'Plo Koon' 'Poe Dameron' 'Poggle the Lesser' 'Quarsh Panaka'
 'Qui-Gon Jinn' 'R2-D2' 'R4-P17' 'R5-D4' 'Ratts Tyerell' 'Raymus Antilles'
 'Rey' 'Ric Olié' 'Roos Tarpals' 'Rugor Nass' 'Saesee Tiin' 'San Hill'
 'Sebulba

In [16]:
# Re-inspect dtypes and non-null counts after the label-encoding step.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87 entries, 0 to 86
Data columns (total 14 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   name        87 non-null     int64  
 1   height      87 non-null     float64
 2   mass        87 non-null     float64
 3   hair_color  87 non-null     int64  
 4   skin_color  87 non-null     int64  
 5   eye_color   87 non-null     int64  
 6   birth_year  87 non-null     float64
 7   sex         87 non-null     int64  
 8   gender      87 non-null     int64  
 9   homeworld   87 non-null     int64  
 10  species     87 non-null     int64  
 11  films       87 non-null     int64  
 12  vehicles    87 non-null     int64  
 13  starships   87 non-null     int64  
dtypes: float64(3), int64(11)
memory usage: 9.6 KB


In [17]:
# Features are every column except the target; the target is the encoded homeworld.
# NOTE(review): X still contains the encoded "name" column, which looks like a
# per-row identifier - confirm whether it should be a feature.
X = df.drop("homeworld", axis=1)
y = df["homeworld"]

# Hold out 20% of the rows for evaluation. random_state pins the shuffle so the
# same rows land in train/test on every run; without it the split (and the
# score computed from it) changes each time the cell is executed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

## Training the model

In [20]:
# Random forests are stochastic (bootstrap sampling and random feature
# selection), so fix random_state to make the fitted model and the reported
# score repeatable across runs.
rclf = RandomForestClassifier(random_state=42)

# Fit on the training split, then report mean accuracy on the held-out split.
rclf.fit(X_train, y_train)
rclf.score(X_test, y_test)

0.1111111111111111