In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math

# Show every column when a DataFrame is displayed (no truncation with "..."),
# then load the country-profile dataset and preview its first 30 rows.
pd.options.display.max_columns = None
data = pd.read_csv('country_profile_variables.csv')
data.head(n=30)


# `columns` is an attribute, not a method, so it is read without parentheses.
# It holds the labels of every column in the DataFrame.
data.columns

# Count missing values before anything else: most ML algorithms cannot handle
# them. `isna` and `isnull` are aliases. The first sum() gives the number of
# missing values per column ...
missing_per_column = data.isna().sum()
missing_per_column
# ... and summing those per-column counts gives the total number of missing
# values in the whole DataFrame.
missing_per_column.sum()


# `duplicated` returns one boolean per row: True when the row repeats an
# earlier row, False otherwise.
data.duplicated(keep='first')



# Applied Descriptive Statistics in Python


In [None]:
#Finding the measures of central tendency and the measures of variation for the 'Population in thousands (2017)' column.

#---------------------------------------Measure-of-Central-Tendencies------------------------------------------------------ #



# Manual mean: the column total divided by the number of observations.
# `sum()` skips missing values, so divide by `count()` (non-null rows only)
# rather than `len()` (all rows). Otherwise a single NaN would bias the result
# low and it would no longer agree with the built-in `mean()`.
population = data['Population in thousands (2017)']
manual_mean = population.sum() / population.count()
manual_mean



# Built-in mean of every column at once. `numeric_only=True` restricts the
# calculation to numeric columns; without it, text columns make `mean` fail.
data.mean(numeric_only=True)




# Manual median: sort the non-missing values and take the middle one.
# The middle position is derived from the data instead of being hard-coded,
# so this keeps working if the number of rows changes.
# `reset_index(drop=True)` renumbers the sorted values 0..n-1 and keeps the
# result a Series (without `drop=True` the old index becomes an extra column).
sorted_data = (
    data['Population in thousands (2017)']
    .dropna()
    .sort_values(ascending=True)
    .reset_index(drop=True)
)
Middle_count_of_column = len(sorted_data) // 2
if len(sorted_data) % 2:
    # Odd number of values: the median is the single middle value.
    manual_median = sorted_data.iloc[Middle_count_of_column]
else:
    # Even number of values: the median is the mean of the two middle values.
    manual_median = sorted_data.iloc[Middle_count_of_column - 1:Middle_count_of_column + 1].mean()
manual_median
# Built-in median. A Series needs no `numeric_only` argument (older pandas
# versions raise NotImplementedError when it is passed to a Series reduction).
data['Population in thousands (2017)'].median()





# Manual mode: count how often each value occurs and keep every value whose
# count equals the highest count. A column can have several modes, so the
# number of modes is derived from the data rather than fixed in advance.
value_frequencies = data['Population in thousands (2017)'].value_counts()
no_of_most_recurring_number = value_frequencies[value_frequencies == value_frequencies.max()]
no_of_most_recurring_number.index.sort_values(ascending=True)
# Built-in mode: returns all of the most frequent values, sorted ascending.
data['Population in thousands (2017)'].mode()



# --------------------------------------------Measure-of-variances--------------------------------------------------------- #




# Variance of the population column. pandas computes the sample variance
# (ddof=1, i.e. it divides by n - 1); the default is spelled out here.
variance = data['Population in thousands (2017)'].var(ddof=1)
# NumPy can compute the variance of a plain Python list. A list has no
# `.var()` method of its own (that method belongs to pandas Series/DataFrame
# and NumPy arrays).
# NOTE: `np.var` defaults to the population variance (ddof=0), whereas pandas'
# `.var()` defaults to the sample variance (ddof=1). ddof=1 is passed so this
# is the sample variance, computed the same way as the pandas result.
Sample = [42, 61, 39, 70, 8, 30, 52, 28, 95, 86, 35, 77, 80, 86, 1, 84, 90, 57, 45, 6, 1, 56, 60, 83, 24, 63, 93, 59, 92, 53]
variance_using_np = np.var(Sample, ddof=1)






# Standard deviation of the population column. pandas computes the sample
# standard deviation (ddof=1); the default is spelled out here.
standard_deviation = data['Population in thousands (2017)'].std(ddof=1)
# NumPy can compute the standard deviation of a plain Python list. A list has
# no `.std()` method of its own.
# The result gets its own name so it does not overwrite `variance_using_np`.
# ddof=1 gives the sample standard deviation, matching the pandas `.std()`
# default (`np.std` alone would use ddof=0, the population formula).
Sample = [42, 61, 39, 70, 8, 30, 52, 28, 95, 86, 35, 77, 80, 86, 1, 84, 90, 57, 45, 6, 1, 56, 60, 83, 24, 63, 93, 59, 92, 53]
std_using_np = np.std(Sample, ddof=1)
# The standard deviation is the square root of the variance, so it can be
# recovered from a known variance.
square_root_of_variance = math.sqrt(variance)
# Check that both routes give the same standard deviation. Floating-point
# results from different code paths can differ in the last bits, so compare
# with a tolerance instead of exact `==`.
# (Neither `==` nor `isclose` is the `is` operator: `is` tests whether two
# names refer to the very same object, e.g. after `b = a`, `a is b` is True.)
math.isclose(square_root_of_variance, standard_deviation)




# Range: the simplest measure of spread, the largest value minus the smallest.
population = data['Population in thousands (2017)']
population.max() - population.min()





# Percentile: tells where a value stands relative to the rest of the data.
# Example: a test score at the 80th percentile is as good as or better than
# 80% of all the scores.
population = data['Population in thousands (2017)']
percentile_75 = np.percentile(population, q=75)
percentile_75
# The 62nd percentile is a single number from the population scale: 62% of
# the values in the column lie below it.
percentile_62 = np.percentile(population, q=62)
percentile_62





# Quartiles split the sorted data into four equal parts: Q1 is the 0.25
# quantile, Q2 the 0.50 quantile (the median) and Q3 the 0.75 quantile.
# `quantile` takes a fraction between 0 and 1, while a percentile takes a
# number between 0 and 100; otherwise they describe the same thing.
# NOTE: despite its name, `Quartile_2` holds the 0.62 quantile, not the second
# quartile. It is used to show that `quantile(0.62)` is the 62nd percentile.
Quartile_2 = data['Population in thousands (2017)'].quantile(q=0.62)
Quartile_2





# Interquartile range (IQR): the difference Q3 - Q1.
#
# Imagine people lined up by their test scores. The IQR describes how spread
# out the scores are in the middle of that line-up:
#   1. Sort the scores from lowest to highest.
#   2. Find the median, the score in the middle: half of the scores lie
#      below it and half above it.
#   3. The median splits the scores into a lower half and an upper half.
#   4. The median of the lower half is the first quartile (Q1): 25% of the
#      scores fall below it. The median of the upper half is the third
#      quartile (Q3): 75% of the scores fall below it.
#   5. IQR = Q3 - Q1, the width of the middle 50% of the data.
#
# Because it ignores the lowest and highest quarters, the IQR is much less
# affected by extreme values than the range is.
Quartile_3 = data['Population in thousands (2017)'].quantile(0.75)
Quartile_1 = data['Population in thousands (2017)'].quantile(0.25)
IQR = Quartile_3 - Quartile_1





# Outliers are detected with two fences built from the quartiles:
#   upper_fence = Q3 + 1.5 * IQR
#   lower_fence = Q1 - 1.5 * IQR
# Any value above the upper fence or below the lower fence is an outlier.
iqr_margin = 1.5 * IQR
upper_fence_of_population = Quartile_3 + iqr_margin
lower_fence_of_population = Quartile_1 - iqr_margin
upper_fence_of_population
lower_fence_of_population
# Select the rows whose population falls outside the fences.
population = data['Population in thousands (2017)']
is_below_lower_fence = population < lower_fence_of_population
is_above_upper_fence = population > upper_fence_of_population
data[is_below_lower_fence | is_above_upper_fence]
# Computing quartiles, standard deviation and so on column by column is
# tedious; `data.describe()` gives a statistical summary of all numeric
# columns in one call.




# Box plots show outliers directly: points drawn beyond the whiskers lie
# outside the fences.
# Only numeric columns are plotted: a box plot summarises a numeric
# distribution, so text columns (e.g. names) are skipped.
for column_name in data.select_dtypes(include='number').columns:
    # One separate figure per column so the plots do not overlap.
    plt.figure()
    sns.boxplot(x=data[column_name])

plt.show()





# Sampling techniques: ways to extract a subset of records from the data.

# 1. Random sampling: `sample` picks rows at random. The size is capped at the
#    number of available rows because `sample` raises when asked for more rows
#    than exist (without replacement).
data.sample(n=min(20, len(data)))

# 2. Conditional selection: keep only the records that satisfy BOTH conditions,
#    population above 10,000 (the column is in thousands) AND population
#    density below 20,000 per km2. `&` requires both conditions to hold; `|`
#    would keep rows matching either one.
#    (Strictly, stratified sampling means splitting the data into groups and
#    sampling from each group; this step only filters by a condition.)
population = data['Population in thousands (2017)']
density = data['Population density (per km2, 2017)']
matching_records = data[(population > 10000) & (density < 20000)]
matching_records

#    The filter can be combined with random sampling by calling `sample` on the
#    filtered result, again capped at the number of matching rows.
matching_records.sample(n=min(10, len(matching_records)))

# 3. Systematic sampling: take every n-th record, here every 5th row.
data[::5]



# Correlation measures how two columns move together, e.g. age and grades.
# The coefficient lies between -1 and 1:
#   * close to  1: both increase together (positive correlation);
#   * close to -1: one decreases as the other increases (negative correlation);
#   * close to  0: no linear relationship.
# Example from the table below: 'GDP per capita (current US$)' and
# 'Urban population (% of total population)' have a coefficient of about 0.40.
# They are positively correlated, but far from perfectly linear; a value near
# 0.9 or 1 would mean they rise almost proportionally.
# Pearson is the default method; `numeric_only=True` skips text columns.
data.corr(method='pearson', numeric_only=True)
