# Data Cleaning and EDA 

In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
sns.set(color_codes=True)
%matplotlib inline 

# Load the raw survey export; the file is decoded as ISO-8859-1 via `encoding`.
starwars = pd.read_csv("star_wars.csv", encoding="ISO-8859-1")

# Structural overview: first ten rows, the column labels, then (rows, columns).
for overview in (starwars.head(10), starwars.columns, starwars.shape):
    print(overview)
# Shape noted by the original author: (1187, 38), i.e. 38 columns.

IOError: File star_wars.csv does not exist

In [None]:
# Inspect the respondent identifier column and build the mask that marks rows
# with an ID. The column is bound to `respondent_ids` rather than `id`, so the
# built-in id() is not shadowed for the rest of the kernel session.
respondent_ids = starwars["RespondentID"]
print(respondent_ids.shape[0])  # total number of rows, missing IDs included

# Boolean mask: True where a RespondentID is present. A later cell indexes the
# frame with `id_not_null`, so that name is kept unchanged.
id_not_null = pd.notnull(respondent_ids)
id_is_not_null = respondent_ids[id_not_null]
print(id_is_not_null.head())    # preview only, instead of dumping the whole Series
print(id_is_not_null.shape[0])  # number of rows that do have an ID


In [None]:
# Show which dtype pandas inferred for the respondent identifier column.
starwars.loc[:, "RespondentID"].dtype

In [None]:
# Keep only the rows whose RespondentID is present.
# Recipe for dropping rows that have a missing value in a particular column:
#   1) build a boolean mask with pd.notnull / pd.isnull, then
#   2) index the DataFrame with that mask to select the matching rows.
starwars = starwars[id_not_null]

In [None]:
# Inspect the two yes/no questions before recoding them: first the frequency of
# each answer, then the set of distinct raw values found in each column.
yes_no_questions = [
    "Have you seen any of the 6 films in the Star Wars franchise?",
    "Do you consider yourself to be a fan of the Star Wars film franchise?",
]
for question in yes_no_questions:
    print(starwars[question].value_counts())
for question in yes_no_questions:
    print(set(starwars[question]))

In [None]:
# Recode the two yes/no survey questions as booleans. Values without an entry
# in `yes_no` (anything other than "Yes" / "No") come back from .map() as NaN.
yes_no = {"Yes": True, "No": False}
yes_no_cols = (
    "Have you seen any of the 6 films in the Star Wars franchise?",
    "Do you consider yourself to be a fan of the Star Wars film franchise?",
)
for question in yes_no_cols:
    recoded = starwars[question].map(yes_no)
    # When a column is transformed, assign the result back to that same column,
    # not to the whole DataFrame.
    starwars[question] = recoded
print(starwars.head())

In [None]:
# Count plot of the answers to the "are you a Star Wars fan" question.
fan_question = "Do you consider yourself to be a fan of the Star Wars film franchise?"
sns.countplot(x=starwars[fan_question])


In [None]:
# Print the distinct raw values stored in each of the six "which films have you
# seen" columns, one set per column.
seen_raw_cols = [
    "Which of the following Star Wars films have you seen? Please select all that apply.",
    "Unnamed: 4",
    "Unnamed: 5",
    "Unnamed: 6",
    "Unnamed: 7",
    "Unnamed: 8",
]
for seen_raw_col in seen_raw_cols:
    print(set(starwars[seen_raw_col]))

In [None]:
# Convert the six "which films have you seen" columns (positions 3-8) to
# booleans: a cell holding a film title means the respondent selected that film,
# a missing cell means they did not.
#
# notnull() is used instead of a {title: True, nan: False} lookup table: the
# lookup only works when every title string matches the file exactly (spacing
# included), and any mismatch would silently become NaN instead of True.
for col in starwars.columns[3:9]:
    # Skip columns that are already boolean so that re-running the cell is
    # harmless (False is "not null" and would otherwise flip to True).
    if starwars[col].dtype != bool:
        starwars[col] = starwars[col].notnull()
print(starwars.head())
# Pattern: iterate over the column names and assign each converted column back
# to its own column, not to the whole DataFrame.

In [None]:
# Give the first "films seen" column a short name. DataFrame.rename takes the
# old -> new mapping through the `columns` keyword (not `col` or `cols`).
seen1_rename = {
    "Which of the following Star Wars films have you seen? Please select all that apply.": "seen1",
}
starwars = starwars.rename(columns=seen1_rename)
print(starwars.columns)

In [None]:
# Rename the remaining five "films seen" columns with a single rename call.
seen_rename = {
    'Unnamed: 4': "seen2",
    'Unnamed: 5': "seen3",
    'Unnamed: 6': "seen4",
    'Unnamed: 7': "seen5",
    'Unnamed: 8': "seen6",
}
starwars = starwars.rename(columns=seen_rename)
print(starwars.columns)

In [None]:
# Display the first ranking column (1 = favorite film, 6 = least favorite).
first_ranking_question = "Please rank the Star Wars films in order of preference with 1 being your favorite film in the franchise and 6 being your least favorite film."
starwars[first_ranking_question]

In [None]:
# Columns 9-14 hold the rank each respondent gave to each film (1 = favorite,
# 6 = least favorite). Cast them to float so they can be averaged.
ranking_cols = starwars.columns[9:15]
starwars[ranking_cols] = starwars[ranking_cols].astype(float)

# Short names for the six ranking columns, applied in one rename call.
# (The original mid-cell `.head()` preview was removed: only the last
# expression of a cell is displayed, so its result was discarded.)
ranking_rename = {
    "Please rank the Star Wars films in order of preference with 1 being your favorite film in the franchise and 6 being your least favorite film.": "ranking_1",
    "Unnamed: 10": "ranking_2",
    "Unnamed: 11": "ranking_3",
    "Unnamed: 12": "ranking_4",
    "Unnamed: 13": "ranking_5",
    "Unnamed: 14": "ranking_6",
}
starwars = starwars.rename(columns=ranking_rename)
starwars.columns

In [None]:
# Look up the label at position 9; the value is not shown because only the
# last expression of a cell is displayed.
starwars.columns[9]
# Mean of the first ranking column.
vector = starwars.loc[:, 'ranking_1']
vector.mean()

In [None]:
# Mean ranking of each of the six films (columns 9-14), in column order.
starwars_mean = [starwars[ranking_col].mean() for ranking_col in starwars.columns[9:15]]
print(starwars_mean)

In [None]:
from numpy import arange  # kept here: the later "films seen" plotting cell also calls arange()

# Bar chart of the mean ranking given to each film.
fig, ax = plt.subplots()
name_cols = ["SW1","SW2","SW3","SW4","SW5","SW6"]

# One integer slot (1..6) per film. The bars are centred on these slots
# explicitly, so each bar sits on its tick whatever matplotlib's default bar
# alignment is. The previous offsets (arange(6) + 0.75) only lined up when bars
# were left-aligned; with centre-aligned bars they end up 0.25 left of the tick.
tick_positions = range(1,7)
bar_positions = arange(1, 7)

ax.bar(bar_positions, starwars_mean, 0.5, align="center")
ax.set_xticks(tick_positions)
ax.set_xticklabels(name_cols, rotation=90)
ax.set_xlabel("StarWars Series Names")
# The plotted values are mean *rankings* (1 = favorite), so lower bars mean
# better-liked films; the labels say so to avoid reading them as ratings.
ax.set_ylabel("Average Ranking (1 = favorite, 6 = least favorite)")
ax.set_title("Average Ranking for StarWars Series")
plt.show()


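Star Wars: Return of the Jedi has the highest mean ranking value (4.34), and Star Wars: Episode II Attack of the Clones has the lowest (2.51). Note that respondents ranked the films from 1 (favorite) to 6 (least favorite), so a lower mean ranking indicates a better-liked film.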

In [None]:
# Number of respondents who have seen each film: the "seen" columns
# (positions 3-8) are summed one by one, in column order.
starwars_seen = [starwars[seen_col].sum() for seen_col in starwars.columns[3:9]]
print(starwars_seen)

fig, ax = plt.subplots()
name_columns = ["SW1","SW2","SW3","SW4","SW5","SW6"]

# One integer slot (1..6) per film, with the bars centred on the slots
# explicitly so they line up with the ticks whatever matplotlib's default bar
# alignment is. The previous offsets (arange(6) + 0.75) assumed left-aligned
# bars and leave centre-aligned bars 0.25 left of their ticks.
tick_positions = range(1,7)
bar_positions = np.arange(1, 7)

ax.bar(bar_positions, starwars_seen, 0.5, align="center")
ax.set_xticks(tick_positions)
ax.set_xticklabels(name_columns,rotation=90)
ax.set_xlabel("StarWars Series")
ax.set_ylabel("Sum of watching series")
ax.set_title("Sum of people watched each series")
plt.show()


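Star Wars: Episode II Attack of the Clones is the most watched title (758 respondents in total), and Star Wars: Return of the Jedi is the least watched title (550). Interestingly, the most watched title also has the lowest mean ranking value, while the least watched title has the highest. Because a ranking of 1 denotes the favorite film, this means the most watched title is also the best liked, and the least watched title is the least liked.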

In [None]:
# Encode the character favorability answers (columns 15-28) on a 1-5 scale,
# where a higher number means a more favorable view.
# "Unfamiliar (N/A)" is not a point on that scale, so it is encoded as NaN and
# left out of the averages. Scoring it as a number between "neutral" and
# "somewhat unfavorably", as before, pulled each character's mean toward the
# middle in proportion to how many respondents did not know the character.
starwars_favorably = {"Very favorably":5,
                     "Somewhat favorably":4,
                     "Neither favorably nor unfavorably (neutral)":3,
                     "Somewhat unfavorably":2,
                     "Very unfavorably":1,
                     "Unfamiliar (N/A)":np.nan}
for col in starwars.columns[15:29]:
    # Skip columns that are already float so that re-running the cell does not
    # map the numeric scores (which are not keys of the dict) to NaN.
    if starwars[col].dtype != float:
        starwars[col] = starwars[col].map(starwars_favorably).astype(float)

# Replace the question text / "Unnamed: N" headers with the character names,
# in a single rename call.
character_names = {
    "Please state whether you view the following characters favorably, unfavorably, or are unfamiliar with him/her.": "Han Solo",
    "Unnamed: 16": "Luke Skywalker",
    "Unnamed: 17": "Princess Leia Organa",
    "Unnamed: 18": "Anakin Skywalker",
    "Unnamed: 19": "Obi Wan Kenobi",
    "Unnamed: 20": "Emperor Palpatine",
    "Unnamed: 21": "Darth Vader",
    "Unnamed: 22": "Lando Calrissian",
    "Unnamed: 23": "Boba Fett",
    "Unnamed: 24": "C-3P0",
    "Unnamed: 25": "R2 D2",
    "Unnamed: 26": "Jar Jar Binks",
    "Unnamed: 27": "Padme Amidala",
    "Unnamed: 28": "Yoda",
}
starwars = starwars.rename(columns=character_names)

In [None]:
# Summary statistics of the frame as returned by describe() with default arguments.
starwars.describe() # Numerical values

In [None]:
# Summary of the object-dtype ('O') columns only.
starwars.describe(include=['O']) # Categorical values

It seems like Jar Jar Binks is the least popular character among the listed Star Wars characters. 

In [None]:
# Relative frequency (normalize=True) of each census-region answer.
starwars["Location (Census Region)"].value_counts(normalize=True)

In [None]:
# Per census region, the mean of the recoded "have you seen any film" column,
# i.e. the share of respondents in that region who answered yes.
seen_any_question = "Have you seen any of the 6 films in the Star Wars franchise?"
region_col = "Location (Census Region)"
starwars[[seen_any_question, region_col]].groupby(region_col).mean()

In [None]:
# Make the fan indicator numeric so that .mean(), groupby(...).mean() and the
# seaborn plots can aggregate it: 1.0 = fan, 0.0 = not a fan, NaN = no answer.
# float is used rather than bool because astype(bool) turns NaN into True,
# which would count every respondent who left the question blank as a fan and
# inflate all of the fan shares computed in later cells.
fan_question = "Do you consider yourself to be a fan of the Star Wars film franchise?"
starwars[fan_question] = starwars[fan_question].astype(float)

In [None]:
# List the current column labels (after the renames performed so far).
starwars.columns 

In [None]:
# Number of respondents per distinct "Household Income" answer.
starwars["Household Income"].value_counts()

In [None]:
# Number of respondents per distinct "Education" answer.
starwars["Education"].value_counts()

# Gender

Let's break down the Star Wars survey responses by the respondents' gender.

In [None]:
# Number of respondents per "Gender" answer.
starwars["Gender"].value_counts() 
# Author's observation: there are more female than male respondents.

In [None]:
# Bar plot of the fan indicator by gender, with one bar per age group (hue).
fan_question = "Do you consider yourself to be a fan of the Star Wars film franchise?"
sns.barplot(data=starwars, x="Gender", y=fan_question, hue="Age")
# Author's observation: male respondents are more likely than female
# respondents to be Star Wars fans.

In [None]:
# Overall mean of the fan indicator column.
fan_question = "Do you consider yourself to be a fan of the Star Wars film franchise?"
starwars.loc[:, fan_question].mean()

# StarTrek fan 

In [None]:
# Relative frequency of each answer to the Star Trek fan question.
trek_question = "Do you consider yourself to be a fan of the Star Trek franchise?"
starwars[trek_question].value_counts(normalize=True)
# Author's observation: most respondents (about 60%) are not Star Trek fans.

In [None]:
# Recode the Star Trek fan question as booleans; values other than "Yes" / "No"
# have no entry in the mapping and come back from .map() as NaN.
trek_question = "Do you consider yourself to be a fan of the Star Trek franchise?"
trek_yes_no = {"Yes":True, "No":False}
starwars[trek_question] = starwars[trek_question].map(trek_yes_no)

In [None]:
# Count of each value in the recoded Star Trek fan column.
trek_question = "Do you consider yourself to be a fan of the Star Trek franchise?"
starwars[trek_question].value_counts()

In [None]:
# Split the respondents into Star Trek fans and non-fans. An explicit equality
# test is used for both halves, so rows that are neither True nor False fall
# into neither subset.
trek_question = "Do you consider yourself to be a fan of the Star Trek franchise?"
is_trek_fan = starwars[trek_question].eq(True)
is_not_trek_fan = starwars[trek_question].eq(False)
startrek_true = starwars[is_trek_fan]
startrek_false = starwars[is_not_trek_fan]

In [None]:
# For each group, sum the six "seen" columns (positions 3-8) to get how many
# respondents in the group have seen each film.
startrek_true_seen = [startrek_true[seen_col].sum() for seen_col in startrek_true.columns[3:9]]
print("StarTrek fan", startrek_true_seen)

startrek_false_seen = [startrek_false[seen_col].sum() for seen_col in startrek_false.columns[3:9]]
print("Non-StarTrek fan", startrek_false_seen)
# Open question from the author: how should these results be visualized?

In [None]:
# Summary statistics again, now that the Star Trek fan column has been recoded.
starwars.describe()

# Region 

Census Regions 
---------------------------------------------------------------
1: East North Central: Indiana, Illinois, Michigan, Ohio, Wisconsin
2: Pacific: Alaska, Washington, California, Oregon, Hawaii
3: South Atlantic: Delaware, District of Columbia, Florida, Georgia, Maryland, North Carolina, South Carolina, Virginia, West Virginia
4: Middle Atlantic: New Jersey, New York, Pennsylvania
5: West South Central: Arkansas, Louisiana, Oklahoma, Texas
6: West North Central: Iowa, Nebraska, Kansas, North Dakota, Minnesota, South Dakota, Missouri
7: Mountain: Arizona, Colorado, Idaho, Montana, Nevada, New Mexico, Utah, Wyoming
8: New England: Massachusetts, Maine, Vermont, New Hampshire, Rhode Island, Connecticut
9: East South Central: Alabama, Kentucky, Mississippi, Tennessee

In [None]:
# Select the census-region answers that are not missing.
region_answers = starwars["Location (Census Region)"]
location_null = pd.isnull(region_answers)
location_good = region_answers[~location_null]
print(location_good)

In [None]:
# Number of respondents per census-region answer.
starwars["Location (Census Region)"].value_counts() 

In [None]:
# Mean of the fan indicator per census region, sorted from highest to lowest.
fan_question = "Do you consider yourself to be a fan of the Star Wars film franchise?"
region_col = "Location (Census Region)"
fan_share_by_region = starwars[[fan_question, region_col]].groupby(region_col).mean()
fan_share_by_region.sort_values(fan_question, ascending=False)
# Author's observations:
# - West South Central (Arkansas, Louisiana, Oklahoma, Texas) has the highest
#   average share of Star Wars fans.
# - The Pacific region (Alaska, Washington, California, Oregon, Hawaii) has the lowest.

# Age

In [None]:
# Relative frequency (normalize=True) of each "Age" answer.
starwars["Age"].value_counts(normalize=True)

In [None]:
# Count plot of the fan indicator, with one bar per age group (hue).
fan_question = "Do you consider yourself to be a fan of the Star Wars film franchise?"
sns.countplot(data=starwars, x=fan_question, hue="Age", palette="RdBu")