# Prepping data for import to an SQLite DB
In this guided project, we were given a CSV containing information about academy award nominations. The goal was to apply SQLite knowledge gained from prior lessons to imported the cleaned data into a database.

In [81]:
import pandas as pd

df = pd.read_csv('academy_awards.csv', encoding='latin1')

df

Unnamed: 0,Year,Category,Nominee,Additional Info,Won?,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10
0,2010 (83rd),Actor -- Leading Role,Javier Bardem,Biutiful {'Uxbal'},NO,,,,,,
1,2010 (83rd),Actor -- Leading Role,Jeff Bridges,True Grit {'Rooster Cogburn'},NO,,,,,,
2,2010 (83rd),Actor -- Leading Role,Jesse Eisenberg,The Social Network {'Mark Zuckerberg'},NO,,,,,,
3,2010 (83rd),Actor -- Leading Role,Colin Firth,The King's Speech {'King George VI'},YES,,,,,,
4,2010 (83rd),Actor -- Leading Role,James Franco,127 Hours {'Aron Ralston'},NO,,,,,,
5,2010 (83rd),Actor -- Supporting Role,Christian Bale,The Fighter {'Dicky Eklund'},YES,,,,,,
6,2010 (83rd),Actor -- Supporting Role,John Hawkes,Winter's Bone {'Teardrop'},NO,,,,,,
7,2010 (83rd),Actor -- Supporting Role,Jeremy Renner,The Town {'James Coughlin'},NO,,,,,,
8,2010 (83rd),Actor -- Supporting Role,Mark Ruffalo,The Kids Are All Right {'Paul'},NO,,,,,,
9,2010 (83rd),Actor -- Supporting Role,Geoffrey Rush,The King's Speech {'Lionel Logue'},NO,,,,,,


# Clean the Data
A couple of cleaning tasks need to be completed before the data can be imported to the SQL database. The year number needs to be extracted and it's data type converted. The project required that we only import nominations that were later than 2000 and only nominations in the actor/actress categories.

Useless columns will be dropped and the data in the Additional Info column(in this case, movie names and the actor's character name) needs to be extracted.

In [82]:
# Remove extraneous information in Year, keep only the year.
df['Year'] = df['Year'].str[0:4]
df['Year'] = df['Year'].astype('int64')

# Restrict dataframe to only years above 2000 and actor award categories
later_than_2000 = df.loc[:, 'Year'] > 2000
award_categories = [    'Actor -- Leading Role',
                        'Actor -- Supporting Role',
                        'Actress -- Leading Role',
                        'Actress -- Supporting Role']

df = df.loc[later_than_2000, :]
df['Category'].isin(award_categories)
mask = df['Category'].isin(award_categories)
nomination = df.loc[mask,:]

nominations.head()

Unnamed: 0,Year,Category,Nominee,Additional Info,Won?,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Won
0,2010.0,Actor -- Leading Role,Javier Bardem,Biutiful {'Uxbal'},NO,,,,,,,0.0
1,2010.0,Actor -- Leading Role,Jeff Bridges,True Grit {'Rooster Cogburn'},NO,,,,,,,0.0
2,2010.0,Actor -- Leading Role,Jesse Eisenberg,The Social Network {'Mark Zuckerberg'},NO,,,,,,,0.0
3,2010.0,Actor -- Leading Role,Colin Firth,The King's Speech {'King George VI'},YES,,,,,,,1.0
4,2010.0,Actor -- Leading Role,James Franco,127 Hours {'Aron Ralston'},NO,,,,,,,0.0


In [83]:
#Prep Win? column for SQL
replace_dict = {"YES": 1, "NO": 0 }
nominations['Won'] = nominations.loc[:,'Won?'].map(replace_dict)

# Remove useless columns
drop_these = ['Won?', 'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10']
final_nominations = nominations.drop(drop_these, axis=1)
final_nominations.reset_index(inplace=True)
final_nominations.drop(200, inplace=True)

final_nominations.head()

Unnamed: 0,index,Year,Category,Nominee,Additional Info,Won
0,0,2010.0,Actor -- Leading Role,Javier Bardem,Biutiful {'Uxbal'},0.0
1,1,2010.0,Actor -- Leading Role,Jeff Bridges,True Grit {'Rooster Cogburn'},0.0
2,2,2010.0,Actor -- Leading Role,Jesse Eisenberg,The Social Network {'Mark Zuckerberg'},0.0
3,3,2010.0,Actor -- Leading Role,Colin Firth,The King's Speech {'King George VI'},1.0
4,4,2010.0,Actor -- Leading Role,James Franco,127 Hours {'Aron Ralston'},0.0


In [84]:
# Clean Additional Info data for a better SQL table
additional_info_one = final_nominations['Additional Info'].str.rstrip("'}")
additional_info_two = additional_info_one.str.split("{'")
movie_names = additional_info_two.str[0]
characters = additional_info_two.str[1]

# Add cleaned data into their own columns, remove the old one.
final_nominations['Movie'] = movie_names
final_nominations['Character'] = characters
final_nominations.drop('Additional Info', axis=1, inplace=True)

final_nominations.head()

Unnamed: 0,index,Year,Category,Nominee,Won,Movie,Character
0,0,2010.0,Actor -- Leading Role,Javier Bardem,0.0,Biutiful,Uxbal
1,1,2010.0,Actor -- Leading Role,Jeff Bridges,0.0,True Grit,Rooster Cogburn
2,2,2010.0,Actor -- Leading Role,Jesse Eisenberg,0.0,The Social Network,Mark Zuckerberg
3,3,2010.0,Actor -- Leading Role,Colin Firth,1.0,The King's Speech,King George VI
4,4,2010.0,Actor -- Leading Role,James Franco,0.0,127 Hours,Aron Ralston


# Import to SQL Database
In the next step SQLite3 will be imported. A connection will be created and the pandas dataframe will be imported in (currently commented out to avoid issues with importing the data multiple times).

Next, two queries will be sent to the database. First for a description of the table that was created. Second, for the first five entries.

In [85]:
import sqlite3
from pprint import pprint

# Create a .db file and use pandas to send the DataFrame to it. 
conn = sqlite3.connect('nominations.db')
#final_nominations.to_sql("nominations", conn, index=False)

# Show us that the columns imported correctly
table_info = conn.execute('PRAGMA table_info(nominations);').fetchall()
pprint(table_info)

# Browse the first ten rows
ten_rows = conn.execute('SELECT * FROM nominations LIMIT 5').fetchall()
pprint(ten_rows)

conn.close()

[(0, 'index', 'INTEGER', 0, None, 0),
 (1, 'Year', 'REAL', 0, None, 0),
 (2, 'Category', 'TEXT', 0, None, 0),
 (3, 'Nominee', 'TEXT', 0, None, 0),
 (4, 'Won', 'REAL', 0, None, 0),
 (5, 'Movie', 'TEXT', 0, None, 0),
 (6, 'Character', 'TEXT', 0, None, 0)]
[(0,
  2010.0,
  'Actor -- Leading Role',
  'Javier Bardem',
  0.0,
  'Biutiful ',
  'Uxbal'),
 (1,
  2010.0,
  'Actor -- Leading Role',
  'Jeff Bridges',
  0.0,
  'True Grit ',
  'Rooster Cogburn'),
 (2,
  2010.0,
  'Actor -- Leading Role',
  'Jesse Eisenberg',
  0.0,
  'The Social Network ',
  'Mark Zuckerberg'),
 (3,
  2010.0,
  'Actor -- Leading Role',
  'Colin Firth',
  1.0,
  "The King's Speech ",
  'King George VI'),
 (4,
  2010.0,
  'Actor -- Leading Role',
  'James Franco',
  0.0,
  '127 Hours ',
  'Aron Ralston')]
