# Star Wars Dataset

## Task 1: Find The Tallest and Second Tallest Star Wars Character

In [1]:
# imports
import warnings
import pandas as pd

# read in the csv and turn it into a dataframe so the data could be manipulated
df = pd.read_csv('characters.csv')
df.head(20)

Unnamed: 0,name,height,mass,hair_color,skin_color,eye_color,birth_year,gender,homeworld,species
0,Luke Skywalker,172.0,77.0,blond,fair,blue,19BBY,male,Tatooine,Human
1,C-3PO,167.0,75.0,,gold,yellow,112BBY,,Tatooine,Droid
2,R2-D2,96.0,32.0,,"white, blue",red,33BBY,,Naboo,Droid
3,Darth Vader,202.0,136.0,none,white,yellow,41.9BBY,male,Tatooine,Human
4,Leia Organa,150.0,49.0,brown,light,brown,19BBY,female,Alderaan,Human
5,Owen Lars,178.0,120.0,"brown, grey",light,blue,52BBY,male,Tatooine,Human
6,Beru Whitesun lars,165.0,75.0,brown,light,blue,47BBY,female,Tatooine,Human
7,R5-D4,97.0,32.0,,"white, red",red,,,Tatooine,Droid
8,Biggs Darklighter,183.0,84.0,black,light,brown,24BBY,male,Tatooine,Human
9,Obi-Wan Kenobi,182.0,77.0,"auburn, white",fair,blue-gray,57BBY,male,Stewjon,Human


In [2]:
height_only = df[['name','height']]
height_only.head()

Unnamed: 0,name,height
0,Luke Skywalker,172.0
1,C-3PO,167.0
2,R2-D2,96.0
3,Darth Vader,202.0
4,Leia Organa,150.0


In [3]:
ordered_height = height_only.sort_values('height', ascending=False)
ordered_height.head(2)

Unnamed: 0,name,height
53,Yarael Poof,264.0
77,Tarfful,234.0


Answer: the tallest character is Yarael Poof, and the second tallest is Tarfful

## Task 2: Are there More Male or Female Characters?

In [52]:
# getting only the columns we need
genders = df[['name','gender']]
genders.head()

Unnamed: 0,name,gender
0,Luke Skywalker,male
1,C-3PO,
2,R2-D2,
3,Darth Vader,male
4,Leia Organa,female


In [53]:
# getting rid of null values
genders = genders.dropna()
genders.head(21)

Unnamed: 0,name,gender
0,Luke Skywalker,male
3,Darth Vader,male
4,Leia Organa,female
5,Owen Lars,male
6,Beru Whitesun lars,female
8,Biggs Darklighter,male
9,Obi-Wan Kenobi,male
10,Anakin Skywalker,male
11,Wilhuff Tarkin,male
12,Chewbacca,male


In [54]:
# getting a count of all genders(that's counting non-humans and droids)
genders.groupby(["gender"]).count()

Unnamed: 0_level_0,name
gender,Unnamed: 1_level_1
female,19
hermaphrodite,1
male,62
none,2


In [60]:
# this is just the count of males and females
males_only = genders[genders['gender']=='male']
females_only = genders[genders['gender']=='female']
female_total = len(females_only.index)
male_total = len(males_only.index)

print(f"There are {female_total} females and {male_total} males")

There are 19 females and 62 males


Answer: There are more male than female characters 

## Task 3: Find the Oldest and Second Oldest Characters

In [61]:
# turn ages into a factor like in R
ages = df[['name','birth_year']]
# theres no point in counting the NAs in this
ages.dropna(inplace=True)
ages

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ages.dropna(inplace=True)


Unnamed: 0,name,birth_year
0,Luke Skywalker,19BBY
1,C-3PO,112BBY
2,R2-D2,33BBY
3,Darth Vader,41.9BBY
4,Leia Organa,19BBY
5,Owen Lars,52BBY
6,Beru Whitesun lars,47BBY
8,Biggs Darklighter,24BBY
9,Obi-Wan Kenobi,57BBY
10,Anakin Skywalker,41.9BBY


In [62]:
# seeing if there's anything other than BBY(BBY is similar to BC and AD)
ages.birth_year.unique()

array(['19BBY', '112BBY', '33BBY', '41.9BBY', '52BBY', '47BBY', '24BBY',
       '57BBY', '64BBY', '200BBY', '29BBY', '44BBY', '600BBY', '21BBY',
       '896BBY', '82BBY', '31.5BBY', '15BBY', '53BBY', '31BBY', '37BBY',
       '41BBY', '48BBY', '8BBY', '92BBY', '91BBY', '62BBY', '72BBY',
       '54BBY', '22BBY', '58BBY', '40BBY', '102BBY', '67BBY', '66BBY',
       '46BBY'], dtype=object)

In [63]:
# since every character in this data set was born BBY and no one was born ABY
# there's no need to keep that in the data set so for now we'll get rid of it because we 
# can assume it's there
ages['birth_year']=ages['birth_year'].replace('BBY','',regex=True).astype(float)
ages
#this warning can be ignored for now because I don't plan on doing anything 
# outside of this notebook

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ages['birth_year']=ages['birth_year'].replace('BBY','',regex=True).astype(float)


Unnamed: 0,name,birth_year
0,Luke Skywalker,19.0
1,C-3PO,112.0
2,R2-D2,33.0
3,Darth Vader,41.9
4,Leia Organa,19.0
5,Owen Lars,52.0
6,Beru Whitesun lars,47.0
8,Biggs Darklighter,24.0
9,Obi-Wan Kenobi,57.0
10,Anakin Skywalker,41.9


In [64]:
# So this is just if I wanted the oldest
ages.max()

name           Yoda
birth_year    896.0
dtype: object

In [69]:
# this is everyone's age from oldest(top) to youngest(bottom)
ages.sort_values('birth_year', ascending=False)

Unnamed: 0,name,birth_year
18,Yoda,896.0
15,Jabba Desilijic Tiure,600.0
12,Chewbacca,200.0
1,C-3PO,112.0
63,Dooku,102.0
30,Qui-Gon Jinn,92.0
48,Ki-Adi-Mundi,92.0
32,Finis Valorum,91.0
19,Palpatine,82.0
58,Cliegg Lars,82.0


## Task 4: Find and Count All unique Homeworlds

In [70]:
# Just getting the columns we need and excluding things that we don't need, like age and gender
home_planets = df[['name','homeworld']]
home_planets.head()

Unnamed: 0,name,homeworld
0,Luke Skywalker,Tatooine
1,C-3PO,Tatooine
2,R2-D2,Naboo
3,Darth Vader,Tatooine
4,Leia Organa,Alderaan


In [82]:
# this is a count of all unique homeworlds
home_planets['homeworld'].count()

77

In [83]:
# So this is a count of how many people are from each home world
home_planets['count'] = 1
planet_count = home_planets.sort_values('count', ascending=False).drop_duplicates(['homeworld'])
planet_count = home_planets.groupby(['homeworld']).sum().reset_index()[['homeworld','count']]
planet_count

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  home_planets['count'] = 1


Unnamed: 0,homeworld,count
0,Alderaan,3
1,Aleen Minor,1
2,Bespin,1
3,Bestine IV,1
4,Cato Neimoidia,1
5,Cerea,1
6,Champala,1
7,Chandrila,1
8,Concord Dawn,1
9,Corellia,2


## Task 5: What eye color is most common and what is the most dominant eye color?

In [128]:
eye_colors = pd.DataFrame(df['eye_color'])
eye_colors.head()

Unnamed: 0,eye_color
0,blue
1,yellow
2,red
3,yellow
4,brown


In [143]:
# Get's all the unique values in the eye_color column and counts them
# variable to hold the count
unique_values = 0
  
# list to hold visited values
visited = []
  
# loop for counting the unique
# values in height
for i in range(0, len(eye_colors['eye_color'])):
    
    if eye_colors['eye_color'][i] not in visited: 
        
        visited.append(eye_colors['eye_color'][i])
          
        unique_values += 1
  
print("Number of unique values :",
      unique_values)
  

Number of unique values : 15


In [152]:
eye_colors['count'] = 1
eye_color_count = eye_colors.sort_values('count', ascending=False).drop_duplicates(['eye_color'])
eye_color_count = eye_colors.groupby(['eye_color']).sum().reset_index()[['eye_color','count']]
eye_color_count = eye_color_count.sort_values('count',ascending=False)
eye_color_count.head(1)


Unnamed: 0,eye_color,count
3,brown,21


the most dominant eye color is brown. 