# Step 1: Import modules

In [None]:
import numpy as np
import pandas as pd
import math
import statistics as st
import matplotlib.pyplot as plt
from scipy.stats import skew
from scipy.stats import kurtosis

# Step 2: Read the dataset

In [None]:
# Load the colleges/universities dataset and preview its first ten rows.
dh = pd.read_csv(filepath_or_buffer="College_And_Universities_1.csv")
dh.head(n=10)

Unnamed: 0,School,Type,Median_SAT,Acceptance_Rate,Expenditures/Student,Top _10percent_HS,Graduation_percent
0,Amherst,Lib Arts,1315,22%,"$26,636",85,93
1,Barnard,Lib Arts,1220,53%,"$17,653",69,80
2,Bates,Lib Arts,1240,36%,"$17,554",58,88
3,Berkeley,University,1176,37%,"$23,665",95,68
4,Bowdoin,Lib Arts,1300,24%,"$25,703",78,90
5,Brown,University,1281,24%,"$24,201",80,90
6,Bryn Mawr,Lib Arts,1255,56%,"$18,847",70,84
7,Cal Tech,University,1400,31%,"$102,262",98,75
8,Carleton,Lib Arts,1300,40%,"$15,904",75,80
9,Carnegie Mellon,University,1225,64%,"$33,607",52,77


# Step 3: Rename (Optional) and Type casting

In [None]:
# Pull out the columns used in the analysis and convert the two
# text-formatted columns to numbers.
graduation = dh['Graduation_percent']
median_sat = dh['Median_SAT']
top = dh['Top _10percent_HS']

# Expenditures are stored as strings such as "$26,636": strip the dollar sign
# and the thousands separators, then cast to float.  The pattern is written as
# a raw string so that "\$" is not parsed as an (invalid) Python string escape.
dh['Expenditures/Student'] = (
    dh['Expenditures/Student']
    .replace({r'\$': '', ',': ''}, regex=True)
    .astype(float)
)
expend = dh['Expenditures/Student']

# Acceptance rates are stored as strings such as "22%": strip the percent sign
# (and any commas), then cast to float.  Values stay in percentage points
# (22.0), they are not rescaled to fractions.
dh['Acceptance_Rate'] = (
    dh['Acceptance_Rate']
    .replace({'%': '', ',': ''}, regex=True)
    .astype(float)
)
accept = dh['Acceptance_Rate']

# Start doing the statistics

## 4.21.1. Finding covariance of Graduation % and Median SAT

### Covariance formula:
## $cov(X,Y)= \frac{\sum_{i=1}^{n}(x_{i}-\bar{x})(y_{i}-\bar{y})}{n-1}$
> Covariance measures how two random variables vary together. Its sign gives the direction of the linear relationship: positive when the variables tend to move in the same direction, negative when they move in opposite directions. Its magnitude depends on the units of the variables, so covariance has no fixed bounds and is hard to interpret as a strength on its own; it can take any value in ($-\infty$, $+\infty$).

In [None]:
# Sample covariance computed directly from the formula (no statistics helpers).
n = len(graduation)
meanGraduation_or = sum(graduation) / n
meanMedianSat_or = sum(median_sat) / len(median_sat)
# Pair the observations positionally with zip() instead of graduation[i]:
# an integer key on a Series is looked up by index label, so iterating over
# range(n) is only correct while the index happens to be the default 0..n-1.
cov_or = sum(
    (g - meanGraduation_or) * (s - meanMedianSat_or)
    for g, s in zip(graduation, median_sat)
) / (n - 1)  # n - 1: sample (unbiased) covariance
cov_or

263.37032312925174

In [None]:
# Cross-check with NumPy: np.cov returns the 2x2 covariance matrix of the two
# series; the off-diagonal entry is cov(graduation, median_sat).
np.cov(graduation, median_sat)[0][1]

263.37032312925174

## 4.21.2. Finding correlation coefficient of Graduation % and Median SAT

### Correlation coefficient formula:
## $r_{xy}= \frac{cov(X,Y)}{s_{x}s_{y}}=\frac{\sum_{i=1}^{n}(x_{i}-\bar{x})(y_{i}-\bar{y})}{\sqrt{\sum_{i=1}^{n}(x_{i}-\bar{x})^2\sum_{i=1}^{n}(y_{i}-\bar{y})^2}}$

> The correlation coefficient is often used to evaluate the linear relationship between two variables. It takes values in the interval [-1, 1], where 1 represents a perfect positive linear correlation, -1 represents a perfect inverse (negative) linear correlation, and 0 indicates no linear relationship.

In [None]:
# Pearson correlation from the formula, without statistics helpers:
# r = cov(X, Y) / (s_x * s_y), with X = graduation and Y = median SAT.
# Reuses n, the two means and cov_or computed in the covariance cell.

# Sample variances (n - 1 denominator) of X and Y.
var_x_or = sum((g - meanGraduation_or) ** 2 for g in graduation) / (n - 1)
var_y_or = sum((s - meanMedianSat_or) ** 2 for s in median_sat) / (n - 1)

# Standard deviations are the square roots of the variances.
stdev_x_or = math.sqrt(var_x_or)
stdev_y_or = math.sqrt(var_y_or)

# Normalise the covariance by the product of the two standard deviations.
s_x_s_y = stdev_x_or * stdev_y_or
corr_or = cov_or / s_x_s_y
corr_or

0.564146826697419

In [None]:
# Cross-check with NumPy: the off-diagonal entry of the 2x2 correlation
# matrix is the Pearson r between graduation and median SAT.
np.corrcoef(graduation, median_sat)[0][1]

0.5641468266974192

## 4.22. Using the Correlation Tool (For all Colleges and Universities' data)

In [None]:
# Remove the two text columns so the correlation matrix can be computed on
# the remaining columns; print what is left as a sanity check.
columns_to_drop = ['School', 'Type']
dh = dh.drop(columns=columns_to_drop)
print(dh.columns)

Index(['Median_SAT', 'Acceptance_Rate', 'Expenditures/Student',
       'Top _10percent_HS', 'Graduation_percent'],
      dtype='object')


> The data table has two columns of data type "object" (text): "School" and "Type". Correlation is only defined for numeric data, so to calculate the correlation matrix of the entire table we must first drop these two columns.

In [None]:
# Pairwise Pearson correlation matrix of the remaining columns.
dh.corr(method='pearson')

Unnamed: 0,Median_SAT,Acceptance_Rate,Expenditures/Student,Top _10percent_HS,Graduation_percent
Median_SAT,1.0,-0.601902,0.572742,0.503468,0.564147
Acceptance_Rate,-0.601902,1.0,-0.284254,-0.609721,-0.550378
Expenditures/Student,0.572742,-0.284254,1.0,0.505782,0.042504
Top _10percent_HS,0.503468,-0.609721,0.505782,1.0,0.138613
Graduation_percent,0.564147,-0.550378,0.042504,0.138613,1.0
