# Practical Data Science Cycle

1. Data Cleaning 
2. Analysis
3. Modelling 
4. Review 


# Load Data

In [36]:
import pandas as pd

# Read the training set from train.csv (path is relative to the working
# directory) and preview its first rows.
df_train = pd.read_csv(r'train.csv')
df_train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


# Data Description 

![Alt text](image.png)

# Data Cleaning

1. Which features are categorical and which are numerical?
2. Available data types?
3. None, blank or empty values?

---

## Which Features are Categorical and which are Numerical?

In [37]:
# Print every column's dtype and non-null count; the dtypes are what the next
# cells use to split the features into categorical and numerical.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [38]:
# Columns stored with the generic `object` dtype are treated as categorical.
print("Categorical Variables")
categorical_v = df_train.select_dtypes(include="object").columns
for column_name in categorical_v.tolist():
    print(column_name)

Categorical Variables
PassengerId
HomePlanet
CryoSleep
Cabin
Destination
VIP
Name


In [39]:
# Numerical columns are selected with the public `select_dtypes` API instead of
# the private `DataFrame._get_numeric_data()` helper, which is not part of the
# supported pandas interface. "bool" is listed explicitly so the boolean
# `Transported` column is still part of the selection, as it was before.
print("Numerical Variables")
numerical_v = df_train.select_dtypes(include=["number", "bool"]).columns
for col in numerical_v:
    print(col)

Numerical Variables
Age
RoomService
FoodCourt
ShoppingMall
Spa
VRDeck
Transported


---

## Cleaning NaN or Empty Values

In [40]:
# Number of missing values per column, computed for the whole frame at once.
missing_counts = df_train.isnull().sum()
for column_name, n_missing in missing_counts.items():
    print(column_name, n_missing)

PassengerId 0
HomePlanet 201
CryoSleep 217
Cabin 199
Destination 182
Age 179
VIP 203
RoomService 181
FoodCourt 183
ShoppingMall 208
Spa 183
VRDeck 188
Name 200
Transported 0


In [41]:
# Share of missing values per column, expressed as a percentage of all rows.
missing_share = df_train.isnull().sum() / len(df_train) * 100
for column_name, share in missing_share.items():
    percentage_null = str(share) + "%"
    print(column_name, percentage_null)

PassengerId 0.0%
HomePlanet 2.312205222592891%
CryoSleep 2.4962613597147127%
Cabin 2.289198205452663%
Destination 2.093638559760727%
Age 2.0591280340503855%
VIP 2.3352122397331185%
RoomService 2.082135051190613%
FoodCourt 2.105142068330841%
ShoppingMall 2.392729782583688%
Spa 2.105142068330841%
VRDeck 2.1626596111814105%
Name 2.300701714022777%
Transported 0.0%


In [42]:
# Number of rows in the training set (the denominator of the percentages above).
len(df_train)

8693

Approaches to handling None (missing) values:

1. Replace None values with the most common value in the column (for categorical variables)
2. Remove the rows that have None values
3. Replace None values with the column mean (for continuous numerical variables)

For this example, we use:

Categorical variables 
- Approach 1 (most common value), applied to every categorical column except `Name`

Numerical variables 
- Approach 3 (column mean)

In [43]:
# Alternative (approach 2, not used in this notebook): drop every row that has
# a missing value in any column. Kept commented out for reference only.
# for col in df_train.columns:
#     df_train.dropna(subset=[col], inplace=True)

# df_train.head()

In [44]:
# Replace missing values with the most common value (mode) of each categorical
# column. `Name` is skipped, so its missing values are left in place.

for cat_v in categorical_v:
    if cat_v != 'Name':
        # Assign the filled column back rather than calling
        # `df_train[cat_v].fillna(..., inplace=True)`: the in-place form acts
        # on a temporary Series (chained assignment), is deprecated, and does
        # not update `df_train` when pandas Copy-on-Write is active.
        df_train[cat_v] = df_train[cat_v].fillna(df_train[cat_v].mode()[0])

In [45]:
# Re-count the missing values in the categorical columns after imputation.
remaining_missing = df_train[categorical_v].isnull().sum()
for column_name, n_missing in remaining_missing.items():
    print(column_name, n_missing)

PassengerId 0
HomePlanet 0
CryoSleep 0
Cabin 0
Destination 0
VIP 0
Name 200


In [46]:
# Replace missing values in each numerical column with that column's mean.
for num_col in numerical_v:
    # Assign the filled column back rather than calling
    # `df_train[num_col].fillna(..., inplace=True)`: the in-place form acts on
    # a temporary Series (chained assignment), is deprecated, and does not
    # update `df_train` when pandas Copy-on-Write is active.
    df_train[num_col] = df_train[num_col].fillna(df_train[num_col].mean())

In [47]:
# Re-count the missing values in the numerical columns after imputation.
remaining_missing = df_train[numerical_v].isnull().sum()
for column_name, n_missing in remaining_missing.items():
    print(column_name, n_missing)

Age 0
RoomService 0
FoodCourt 0
ShoppingMall 0
Spa 0
VRDeck 0
Transported 0


# Analysis 

1. Data Exploration 
2. Feature Engineering 
3. Data Visualization 

## Data Exploration

In [21]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df_train.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,8693.0,8693.0,8693.0,8693.0,8693.0,8693.0
mean,28.82793,224.687617,458.077203,173.729169,311.138778,304.854791
std,14.339054,659.739364,1594.434978,597.41744,1124.675871,1133.259049
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,20.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,37.0,78.0,118.0,45.0,89.0,71.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


In [20]:
# Summary (count, unique, top, freq) of the object-dtype columns.
df_train.describe(include=["O"])

Unnamed: 0,PassengerId,HomePlanet,Cabin,Destination,Name
count,8693,8693,8693,8693,8693
unique,8693,3,6560,3,8473
top,8279_01,Earth,G/734/S,TRAPPIST-1e,Alraium Disivering
freq,1,4803,207,6097,202


In [24]:
# Show the rows carrying the name that describe() reports as the most frequent.
# NOTE(review): one name repeated ~200 times is close to the number of missing
# names counted earlier, so this looks like a leftover of a run in which Name
# was also mode-imputed - confirm with a clean Restart & Run All.
df_train[df_train["Name"]=="Alraium Disivering"]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
27,0022_01,Mars,False,D/0/P,TRAPPIST-1e,21.0,False,980.0,2.0,69.0,0.0,0.0,Alraium Disivering,False
58,0064_01,Mars,True,F/14/S,TRAPPIST-1e,15.0,False,0.0,0.0,0.0,0.0,0.0,Alraium Disivering,True
65,0069_01,Earth,False,F/16/S,TRAPPIST-1e,42.0,False,887.0,0.0,9.0,6.0,0.0,Alraium Disivering,True
77,0082_03,Mars,False,F/16/P,TRAPPIST-1e,8.0,False,0.0,0.0,0.0,0.0,0.0,Alraium Disivering,True
101,0108_02,Earth,False,G/19/S,TRAPPIST-1e,31.0,False,562.0,0.0,326.0,0.0,0.0,Alraium Disivering,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8629,9205_02,Europa,True,B/300/P,TRAPPIST-1e,15.0,False,0.0,0.0,0.0,0.0,0.0,Alraium Disivering,True
8631,9208_01,Earth,True,G/1485/S,TRAPPIST-1e,35.0,False,0.0,0.0,0.0,0.0,0.0,Alraium Disivering,True
8636,9218_01,Europa,True,B/353/S,55 Cancri e,43.0,False,0.0,0.0,0.0,0.0,0.0,Alraium Disivering,True
8652,9230_01,Europa,False,C/342/S,TRAPPIST-1e,36.0,True,0.0,5600.0,715.0,2868.0,971.0,Alraium Disivering,True
