# Introduction to Python and Machine Learning 
### By Chukwuma Anthony Nwachukwu
### anthonynwachukwu19@gmail.com

In [1]:
#importing libraries
import pandas as pd, numpy as np, warnings, matplotlib.pyplot as plt, seaborn as sns 
warnings.simplefilter(action = 'ignore', category = FutureWarning)

In [2]:
#importing dataset
data=pd.read_csv('Stage_A.csv', encoding="latin-1")

In [3]:
data.head()

Unnamed: 0,Area Code,Area,Item Code,Item,Element Code,Element,Unit,Y2014,Y2015,Y2016,Y2017,Y2018
0,4,Algeria,2501,Population,511,Total Population - Both sexes,1000 persons,38924.0,39728.0,40551.0,41389.0,42228.0
1,4,Algeria,2501,Population,5301,Domestic supply quantity,1000 tonnes,0.0,0.0,0.0,0.0,0.0
2,4,Algeria,2901,Grand Total,664,Food supply (kcal/capita/day),kcal/capita/day,3377.0,3379.0,3372.0,3341.0,3322.0
3,4,Algeria,2901,Grand Total,674,Protein supply quantity (g/capita/day),g/capita/day,94.9,94.35,94.72,92.82,91.83
4,4,Algeria,2901,Grand Total,684,Fat supply quantity (g/capita/day),g/capita/day,80.06,79.36,77.4,80.19,77.28


## Data Preprocessing

In [4]:
data.drop(data.columns[[0,2,6]],axis=1, inplace = True)

In [5]:
##Making all features lowercase
data.columns = data.columns.str.lower()
## renaming features
data.rename(columns={"y2014":'2014',"y2015":'2015','y2016':'2016','y2017':'2017','y2018':'2018',"element code" : "element_code"},inplace=True)


In [6]:
data.head()

Unnamed: 0,area,item,element_code,element,2014,2015,2016,2017,2018
0,Algeria,Population,511,Total Population - Both sexes,38924.0,39728.0,40551.0,41389.0,42228.0
1,Algeria,Population,5301,Domestic supply quantity,0.0,0.0,0.0,0.0,0.0
2,Algeria,Grand Total,664,Food supply (kcal/capita/day),3377.0,3379.0,3372.0,3341.0,3322.0
3,Algeria,Grand Total,674,Protein supply quantity (g/capita/day),94.9,94.35,94.72,92.82,91.83
4,Algeria,Grand Total,684,Fat supply quantity (g/capita/day),80.06,79.36,77.4,80.19,77.28


## Questions

In [7]:
##Mean
mean = round(data['2017'].mean(),2)
##Standard Deviation
std = round(data['2017'].std())
print("The Mean and Standard Deviation for Year 2017 is", mean, 'and', std)

The Mean and Standard Deviation for Year 2017 is 140.92 and 1672


What is the total sum of Wine produced in 2015 and 2018 respectively?

In [8]:
##Groupingby Wine for 2015 and 2018
Wine_2015 = round(data.groupby(['item']).sum().loc["Wine",'2015'],2)
Wine_2018 = round(data.groupby(['item']).sum().loc["Wine",'2018'],2)
print("Total sum of Wine produced in 2015 and 2018 is", Wine_2015, "and", Wine_2018)

Total sum of Wine produced in 2015 and 2018 is 4251.81 and 4039.32



Which year had the least correlation with ‘Element Code’?

In [9]:
# Calculating Correlation w.r.t element_code
correlation = data.corr(numeric_only=True)['element_code'].sort_values(ascending=True).idxmin()
print("The year with the least correlation to Element Code is", correlation)

The year with the least correlation to Element Code is 2016


What is the total number and percentage of missing data in 2014 to 3 decimal places?

In [10]:
#Calculating the percentage of missing data for 2014 and the number of missing data
missing_data_2014 = data['2014'].isnull().sum()
total = len(data['2014'])
percentage = round(missing_data_2014 / total * 100 ,3)

print("The total number and percentage of missing data in 2014 is",missing_data_2014,"and", percentage,"%")

The total number and percentage of missing data in 2014 is 1589 and 2.607 %



Perform a groupby operation on ‘Element’.  What year has the highest sum of Stock Variation?

In [11]:
#groupingby Element and indexing Stock Variation
stock_variation = data.groupby(["element"]).sum().loc["Stock Variation"]
#Finding the idxmax for the respective years in the dataset
data_stock = stock_variation[['2014','2015','2016','2017','2018']].idxmax()
#Finding the max value for the respective years in the dataset
data_stock_value = stock_variation[['2014','2015','2016','2017','2018']].max()
print("The year with the max sum of Stock Variation is",data_stock, "with value",data_stock_value)

The year with the max sum of Stock Variation is 2014 with value 58749.83


What is the total number of unique countries in the dataset??

In [12]:
#Finding the unique values in area
unique_countries=len(data.area.unique())
print("The total number of unique countries in the dataset is",unique_countries)

The total number of unique countries in the dataset is 49


How would you check for the number of rows and columns in a pandas DataFrame named df?

In [13]:
#Finding the rows and columns(observation and features) of a dataset
df = data.shape
df

(60943, 9)

Given the following numpy array 

array  = ([[94, 89, 63    [93, 92,       [92, 94, 56]])

How would you select  the elements in bold and italics from th
e array?

In [14]:
#Creating an array
array = np.array([[94, 89, 63],[93, 92, 48],[92, 94, 56]])
print(array[:2,1:])

[[89 63]
 [92 48]]



Consider the following list of tuples:

y = [(2, 4), (7, 8), (1, 5, 9)]

How would you assign element 8 from the list to a variable x?

In [15]:
#Creatinf a list of tuple
y = [(2, 4), (7, 8), (1, 5, 9)]
x=y[1][-1]
print(x)

8



Select columns ‘Y2017’ and ‘Area’, Perform a groupby operation on ‘Area’.  Which of these Areas had the 7th lowest sum in 2017?

In [16]:
#Groupingby sum of Area with respect to 2017, sorting in ascending order and indexing the 7th value
lowest_sum_2017 = data.groupby(['area']).sum()['2017'].sort_values(ascending = True).index[6]
print("The country with the 7th lowest sum in 2017 is", lowest_sum_2017)

The country with the 7th lowest sum in 2017 is Guinea-Bissau



What would be the output for?

S = [['him', 'sell'], [90, 28, 43]]

S[0][1][1]

In [17]:
S = [['him', 'sell'], [90, 28, 43]]

print("Answer:",S[0][1][1])

Answer: e



What is the total Protein supply quantity in Madagascar in 2015?

In [18]:
#Groupingby total sums of element per area for 2017 then indexing Protein supply quantity and Madagascar
protein_supply = data.groupby(["element","area"]).sum()['2015'].loc["Protein supply quantity (g/capita/day)","Madagascar"]
print("The total Protein supply quantity in Madagascar in 2015 is",protein_supply)

The total Protein supply quantity in Madagascar in 2015 is 173.05



Perform a groupby operation on ‘Element’.  What is the total number of the sum of Processing in 2017?

In [19]:
#Groupingby the sum of all element features wrt 2017 and indexing Processing 
processing = data.groupby(["element"]).sum()['2017'].loc["Processing"]
print("The total sum of Processing in 2017 is",processing)

The total sum of Processing in 2017 is 292836.0



If you have the following list

lst = [[35, 'Portugal', 94], [33, 'Argentina', 93], [30 , 'Brazil', 92]]

col = [‘Age’,’Nationality’,’Overall’]

How do you create a pandas DataFrame using this list, to look like the table below?

In [20]:
#Creating a DataFrame from lists
lst = [[35, 'Portugal', 94], [33, 'Argentina', 93], [30 , 'Brazil', 92]]
col = ['Age','Nationality','Overall']
##Creating DataFramea
new = pd.DataFrame(lst, columns = col, index = [i for i in range(1,4)])
new

Unnamed: 0,Age,Nationality,Overall
1,35,Portugal,94
2,33,Argentina,93
3,30,Brazil,92



Which of the following dataframe methods can be used to access elements across rows and columns?

In [21]:
#Indexing rows and columns
data.iloc[:]

Unnamed: 0,area,item,element_code,element,2014,2015,2016,2017,2018
0,Algeria,Population,511,Total Population - Both sexes,38924.00,39728.00,40551.00,41389.00,42228.00
1,Algeria,Population,5301,Domestic supply quantity,0.00,0.00,0.00,0.00,0.00
2,Algeria,Grand Total,664,Food supply (kcal/capita/day),3377.00,3379.00,3372.00,3341.00,3322.00
3,Algeria,Grand Total,674,Protein supply quantity (g/capita/day),94.90,94.35,94.72,92.82,91.83
4,Algeria,Grand Total,684,Fat supply quantity (g/capita/day),80.06,79.36,77.40,80.19,77.28
...,...,...,...,...,...,...,...,...,...
60938,Zimbabwe,Miscellaneous,5142,Food,42.00,46.00,33.00,19.00,16.00
60939,Zimbabwe,Miscellaneous,645,Food supply quantity (kg/capita/yr),3.06,3.33,2.35,1.33,1.08
60940,Zimbabwe,Miscellaneous,664,Food supply (kcal/capita/day),3.00,4.00,3.00,1.00,1.00
60941,Zimbabwe,Miscellaneous,674,Protein supply quantity (g/capita/day),0.10,0.11,0.08,0.04,0.04



Given the following python code, what would the output of the code give?

my_tuppy = (1,2,5,8)

my_tuppy[2] = 6

In [22]:
#Testting an error message
my_tuppy = (1,2,5,8)

my_tuppy[2] = 6

TypeError: 'tuple' object does not support item assignment

Select columns ‘Y2017’ and ‘Area’, Perform a groupby operation on ‘Area’.  Which of these Areas had the highest sum in 2017?

In [23]:
#Groupingby the total sum for area wrt 2017 and indentifying which has the max value
max_area = data.groupby(["area"]).sum()['2017'].idxmax()
print("The country with the highest sum for 2017 was", max_area)

The country with the highest sum for 2017 was Nigeria


# END