## Reading files into python

In [10]:
# Python library for reading raw data to python dataFrame
import pandas as pd

## Reading Csv flat files

In [11]:
# Loading the dataset using csv
data = pd.read_csv("Data/course_offerings.csv")
data

Unnamed: 0,course_id,course_name,instructor
0,101,Intro to Python,Chris Bruehl
1,102,Intro to SQL,John Pauler
2,201,Exploratory Data Analysis,Alice Zhao
3,301,Algorithms,Chris Bruehl
4,331,Natural Language Processing,Alice Zhao


In [12]:
# Changing the separator to a pipe
data = pd.read_csv("Data/course_offerings.csv",'|')
data

  data = pd.read_csv("Data/course_offerings.csv",'|')


Unnamed: 0,"course_id,course_name,instructor"
0,"101,Intro to Python,Chris Bruehl"
1,"102,Intro to SQL,John Pauler"
2,"201,Exploratory Data Analysis,Alice Zhao"
3,"301,Algorithms,Chris Bruehl"
4,"331,Natural Language Processing,Alice Zhao"


In [13]:
# Working with separators and headers
data = pd.read_csv("Data/course_offerings.csv",sep='|',header=None)
data

Unnamed: 0,0
0,"course_id,course_name,instructor"
1,"101,Intro to Python,Chris Bruehl"
2,"102,Intro to SQL,John Pauler"
3,"201,Exploratory Data Analysis,Alice Zhao"
4,"301,Algorithms,Chris Bruehl"
5,"331,Natural Language Processing,Alice Zhao"


In [14]:
# Working with infer
data = pd.read_csv("Data/course_offerings.csv",sep=',',header="infer")
data

Unnamed: 0,course_id,course_name,instructor
0,101,Intro to Python,Chris Bruehl
1,102,Intro to SQL,John Pauler
2,201,Exploratory Data Analysis,Alice Zhao
3,301,Algorithms,Chris Bruehl
4,331,Natural Language Processing,Alice Zhao


## Reading Excel flat files

In [15]:
# Loading the dataset using excel
data_xlsx = pd.read_excel("Data/Course Offerings.xlsx")
data_xlsx

Unnamed: 0,Instructor,Title
0,Chris Bruehl,Lead Python Instructor
1,John Pauler,Chief Growth Officer
2,Alice Zhao,Data Science Instructor


In [16]:
# Using the sheet_name parameter
data_xls = pd.read_excel("Data/Course Offerings.xlsx",sheet_name=1)
data_xls

Unnamed: 0,Course ID,Course Name,Instructor
0,101,Intro to Python,Chris Bruehl
1,102,Intro to SQL,John Pauler
2,201,Exploratory Data Analysis,Alice Zhao
3,301,Algorithms,Chris Bruehl
4,331,Natural Language Processing,Alice Zhao


## Connecting to an SQLite Database

In [17]:
# Importing the sqlite
import sqlite3 

In [18]:
# Create a connection
cont = sqlite3.connect("Data/maven.db")

print("Connection created")

Connection created


In [19]:
# Loading the data from the database
data_sql = pd.read_sql("SELECT * FROM courses",cont)
data_sql

Unnamed: 0,course_id,course,instructor
0,101,Intro to Python,Chris Bruehl
1,102,Intro to SQL,John Pauler
2,201,Exploratory Data Analysis,Alice Zhao
3,301,Algorithms,Chris Bruehl
4,331,Natural Language Processing,Alice Zhao


In [20]:
# Quering the database where instructor is  Alice Zhao
data_sql = pd.read_sql("SELECT * FROM courses WHERE instructor = 'Alice Zhao'",cont)
data_sql

Unnamed: 0,course_id,course,instructor
0,201,Exploratory Data Analysis,Alice Zhao
1,331,Natural Language Processing,Alice Zhao


In [21]:
# Grouping the data using instructors and the no of courses they teach
data_sql = pd.read_sql("SELECT course,instructor,COUNT(*) as courses_teach FROM courses GROUP BY instructor",cont)
data_sql

Unnamed: 0,course,instructor,courses_teach
0,Exploratory Data Analysis,Alice Zhao,2
1,Intro to Python,Chris Bruehl,2
2,Intro to SQL,John Pauler,1


In [22]:
# Getting the total number of books
data_sql = pd.read_sql("SELECT COUNT(*) as total_books FROM courses",cont)
data_sql

Unnamed: 0,total_books
0,5


In [23]:
# Getting the courses that are taught by instructors
data_sql = pd.read_sql("SELECT course FROM courses WHERE instructor = 'Alice Zhao'",cont)
data_sql

Unnamed: 0,course
0,Exploratory Data Analysis
1,Natural Language Processing


In [24]:
data_sql = pd.read_sql("SELECT course FROM courses WHERE instructor = 'Chris Bruehl'",cont)
data_sql

Unnamed: 0,course
0,Intro to Python
1,Algorithms


In [25]:
data_sql = pd.read_sql("SELECT course,instructor FROM courses GROUP BY course,instructor",cont)
data_sql

Unnamed: 0,course,instructor
0,Algorithms,Chris Bruehl
1,Exploratory Data Analysis,Alice Zhao
2,Intro to Python,Chris Bruehl
3,Intro to SQL,John Pauler
4,Natural Language Processing,Alice Zhao


## Exploring a DataFrame

In [26]:
# First five rows of a dataset
data.head()

Unnamed: 0,course_id,course_name,instructor
0,101,Intro to Python,Chris Bruehl
1,102,Intro to SQL,John Pauler
2,201,Exploratory Data Analysis,Alice Zhao
3,301,Algorithms,Chris Bruehl
4,331,Natural Language Processing,Alice Zhao


In [27]:
# Last five rows of a dataset
data.tail()

Unnamed: 0,course_id,course_name,instructor
0,101,Intro to Python,Chris Bruehl
1,102,Intro to SQL,John Pauler
2,201,Exploratory Data Analysis,Alice Zhao
3,301,Algorithms,Chris Bruehl
4,331,Natural Language Processing,Alice Zhao


In [28]:
# Random sample of the dataset rows
data.sample(3)

Unnamed: 0,course_id,course_name,instructor
2,201,Exploratory Data Analysis,Alice Zhao
0,101,Intro to Python,Chris Bruehl
1,102,Intro to SQL,John Pauler


In [29]:
# Shape of the data -> rows and columns
data.shape

(5, 3)

In [30]:
# Displaying the number of values in each column
data.count()

course_id      5
course_name    5
instructor     5
dtype: int64

In [31]:
# Displaying summary statistics of the dataset
data.describe()

Unnamed: 0,course_id
count,5.0
mean,207.2
std,107.829495
min,101.0
25%,102.0
50%,201.0
75%,301.0
max,331.0


In [32]:
# Displaying non-null values and data types of each column
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   course_id    5 non-null      int64 
 1   course_name  5 non-null      object
 2   instructor   5 non-null      object
dtypes: int64(1), object(2)
memory usage: 248.0+ bytes


## Gathering Data Assignment

In [33]:
# Loading the dataset
df_happy = pd.read_csv("Data/happiness_survey_data.csv")
df_happy

Unnamed: 0,country_name,year,happiness_score,social_support,freedom_to_make_life_choices,healthy_life_expectancy_at_birth
0,Afghanistan,2008,3.723590,0.450662,0.718114,50.500000
1,Afghanistan,2009,4.401778,0.552308,0.678896,50.799999
2,Afghanistan,2010,4.758381,0.539075,0.600127,51.099998
3,Afghanistan,2011,3.831719,0.521104,0.495901,51.400002
4,Afghanistan,2012,3.782938,0.520637,0.530935,51.700001
...,...,...,...,...,...,...
2084,Zimbabwe,2017,3.638300,0.754147,0.752826,52.150002
2085,Zimbabwe,2018,3.616480,0.775388,0.762675,52.625000
2086,Zimbabwe,2019,2.693523,0.759162,0.631908,53.099998
2087,Zimbabwe,2020,3.159802,0.717243,0.643303,53.575001


In [34]:
# Geting first 5 rows of the dataFrame
df_happy.head()

Unnamed: 0,country_name,year,happiness_score,social_support,freedom_to_make_life_choices,healthy_life_expectancy_at_birth
0,Afghanistan,2008,3.72359,0.450662,0.718114,50.5
1,Afghanistan,2009,4.401778,0.552308,0.678896,50.799999
2,Afghanistan,2010,4.758381,0.539075,0.600127,51.099998
3,Afghanistan,2011,3.831719,0.521104,0.495901,51.400002
4,Afghanistan,2012,3.782938,0.520637,0.530935,51.700001


In [35]:
# Getting the last 5 rows of the dataFrame
df_happy.tail()

Unnamed: 0,country_name,year,happiness_score,social_support,freedom_to_make_life_choices,healthy_life_expectancy_at_birth
2084,Zimbabwe,2017,3.6383,0.754147,0.752826,52.150002
2085,Zimbabwe,2018,3.61648,0.775388,0.762675,52.625
2086,Zimbabwe,2019,2.693523,0.759162,0.631908,53.099998
2087,Zimbabwe,2020,3.159802,0.717243,0.643303,53.575001
2088,Zimbabwe,2021,3.154578,0.685151,0.667636,54.049999


In [36]:
# Getting 10 random samples of the data
df_happy.sample(10)

Unnamed: 0,country_name,year,happiness_score,social_support,freedom_to_make_life_choices,healthy_life_expectancy_at_birth
1097,Luxembourg,2017,7.061381,0.905436,0.902822,71.550003
984,Kosovo,2020,6.294414,0.792374,0.879838,
1012,Kyrgyzstan,2021,5.5637,0.904273,0.917871,66.849998
351,Chile,2012,6.599129,0.855236,0.733611,68.860001
280,Cambodia,2008,4.462164,0.619264,0.914173,58.32
271,Burkina Faso,2020,4.63964,0.667709,0.750226,55.275002
1973,United States,2012,7.026227,0.903192,0.822662,66.660004
44,Argentina,2009,6.424133,0.918693,0.636646,66.18
798,India,2021,3.558254,0.569733,0.866111,60.900002
2036,Vietnam,2012,5.53457,0.775009,0.856053,64.660004


In [37]:
# Displaying the no. of rows and columns
df_happy.shape

(2089, 6)

In [38]:
# Displaying the statistics of the data
df_happy.describe()

Unnamed: 0,year,happiness_score,social_support,freedom_to_make_life_choices,healthy_life_expectancy_at_birth
count,2089.0,2089.0,2076.0,2057.0,2031.0
mean,2013.727621,5.473747,0.811542,0.745462,63.180326
std,4.455614,1.115567,0.118935,0.140751,6.948546
min,2005.0,2.178809,0.290184,0.257534,6.72
25%,2010.0,4.651972,0.747664,0.651689,58.965
50%,2014.0,5.405246,0.83477,0.767357,64.980003
75%,2017.0,6.294282,0.904682,0.857677,68.362499
max,2021.0,8.018934,0.987343,0.985178,74.349998


In [39]:
# Displaying the number of values in each column
df_happy.count()

country_name                        2089
year                                2089
happiness_score                     2089
social_support                      2076
freedom_to_make_life_choices        2057
healthy_life_expectancy_at_birth    2031
dtype: int64

In [40]:
# Displaying the non-null values
df_happy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2089 entries, 0 to 2088
Data columns (total 6 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   country_name                      2089 non-null   object 
 1   year                              2089 non-null   int64  
 2   happiness_score                   2089 non-null   float64
 3   social_support                    2076 non-null   float64
 4   freedom_to_make_life_choices      2057 non-null   float64
 5   healthy_life_expectancy_at_birth  2031 non-null   float64
dtypes: float64(4), int64(1), object(1)
memory usage: 98.0+ KB
