In [2]:
import pandas as pd

## Note:
In the lecture, it was noticed that the ACW data didn't seem to go above 0.6.
This was due to an error in data collating on my part when anonymising the data.
Mistakes like this can happen, so it is good practice to use Data Understanding to look for potential contradictions in the data.

The error has been fixed for the plots in this notebook, which may change some of the distributions and therefore the conclusions.

*If something doesn't seem to make sense, question it!*

In [3]:
# Load the raw marks CSV. The path is relative, so the file must be present in
# the notebook's working directory for this cell to succeed.
df = pd.read_csv('./08140marks2018_raw.csv')

FileNotFoundError: [Errno 2] File b'./08140marks2018_raw.csv' does not exist: b'./08140marks2018_raw.csv'

In [None]:
# Summarise the loaded frame: column names, non-null counts and dtypes.
df.info()

In [None]:
# Peek at the first few rows of the raw data.
df.head()

In [None]:
# Pull out the Attendance, Total, Exam and Final columns.
columns_of_interest = ['Attendance', 'Total', 'Exam', 'Final']
extracted = df.loc[:, columns_of_interest]

# Show five randomly chosen rows as a quick sanity check.
extracted.sample(5)

In [None]:
# Replace every missing value with 0; later cells filter on `!= 0` to drop such records.
# (A row cut-off via .truncate(after=170) was tried here and is left disabled.)
cleaned = extracted.fillna(0)

In [None]:
# Relabel the 'Total' column as 'ACW' using an old-name -> new-name mapping.
column_renames = {'Total': 'ACW'}
cleaned = cleaned.rename(columns=column_renames)

# Random five-row peek to confirm the new column label.
cleaned.sample(5)

In [None]:
def series_pcnt_to_fract(series_item):
    """Convert a percentage string such as ``"45%"`` to a fraction (``0.45``).

    Parameters
    ----------
    series_item : object
        A single cell value. Only strings that end in ``%`` (ignoring
        surrounding whitespace) are converted.

    Returns
    -------
    float or object
        ``float(number) / 100`` for percentage strings; any other value,
        including empty strings and non-strings, is returned unchanged.

    Raises
    ------
    ValueError
        If the text before the ``%`` sign is not a valid number.
    """
    # isinstance() also accepts str subclasses, which a `type(...) == str` check would miss.
    if not isinstance(series_item, str):
        return series_item

    text = series_item.strip()
    # endswith() is safe on an empty string, whereas indexing with [-1] raises IndexError.
    if text.endswith("%"):
        return float(text[:-1]) / 100
    return series_item

def df_apply_pcnt(x):
    """Run ``series_pcnt_to_fract`` over every element of ``x`` via ``x.apply``.

    ``x`` is expected to be a single column handed over by ``DataFrame.apply``;
    the result of ``x.apply`` is returned as-is.
    """
    converted = x.apply(series_pcnt_to_fract)
    return converted

In [None]:
# Convert every percentage string in the frame to a fraction, one column at a time.
cleaned = cleaned.apply(df_apply_pcnt)

In [None]:
import seaborn as sns

In [None]:
# A boolean mask picks out the records whose attendance is non-zero;
# .sample(5) then displays five of those rows at random.
has_attendance = cleaned['Attendance'].ne(0)
cleaned[has_attendance].sample(5)

In [None]:
# Relate attendance (our X variable) to the ACW grade (our Y variable),
# using only the records that have a non-zero ACW grade.
# The mask is built once and applied with .loc so both columns share exactly the same rows.
has_acw = cleaned['ACW'] != 0
atten_x = cleaned.loc[has_acw, 'Attendance']
atten_y = cleaned.loc[has_acw, 'ACW']

# Standard boilerplate (LinearRegression and plt are also used by later cells).
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

# scikit-learn expects a 2D feature matrix. .values gives the column as a NumPy array,
# and reshape(-1, 1) turns it into a single-feature matrix with one row per record.
features = atten_x.values.reshape(-1, 1)

# Make a model, fit it, predict on it.
model = LinearRegression()
model = model.fit(features, atten_y)
pred_y = model.predict(features)

# Draw the raw data with a scatterplot.
# x and y are passed by keyword: recent seaborn releases reject them as positional arguments.
sns.scatterplot(x=atten_x, y=atten_y, label="Raw")

# Draw our X input against the linear regression output (the line of best fit).
# The line is coloured black directly instead of being looked up and recoloured afterwards.
# The trailing semicolon stops the Axes repr being echoed as cell output.
sns.lineplot(x=atten_x, y=pred_y, label="Fit", color="black");

In [None]:
# Same again, but this time restricted to records with a non-zero exam mark.
# atten_x / atten_y are reused by the univariate plots in the following cells.
has_exam = cleaned['Exam'] != 0
atten_x = cleaned.loc[has_exam, 'Attendance']
atten_y = cleaned.loc[has_exam, 'Exam']

# Single-feature 2D matrix for scikit-learn, built once and used for both fit and predict.
features = atten_x.values.reshape(-1, 1)

# Standard: make a model, fit it, predict on it.
model = LinearRegression()
model = model.fit(features, atten_y)
pred_y = model.predict(features)

# x and y are passed by keyword: recent seaborn releases reject them as positional arguments.
sns.scatterplot(x=atten_x, y=atten_y, label="Raw")

# Fit line coloured black directly; the trailing semicolon suppresses the Axes repr.
sns.lineplot(x=atten_x, y=pred_y, label="Fit", color="black");

In [None]:
# Univariate plot of Attendance (records with a non-zero exam mark).
# histplot with a KDE overlay on a density scale replaces the deprecated distplot.
sns.histplot(x=atten_x, kde=True, stat="density");

In [None]:
# Univariate plot of Exam grades. Seems to follow a normal distribution. Good!
# histplot with a KDE overlay on a density scale replaces the deprecated distplot.
sns.histplot(x=atten_y, kde=True, stat="density");

In [None]:
# Discretise attendance into 5 bands, using only records where BOTH the exam and ACW marks are non-zero.

# Improvement compared to lecture delivery.
# Chaining two boolean filters, as in
#   cleaned[ cleaned['Exam'] != 0 ][ cleaned['ACW'] != 0 ]
# applies a mask built on the original frame to an already-pruned frame, so the indices no longer line up.
# Building one combined mask on `cleaned` avoids that mismatch.
both_nonzero = (cleaned['Exam'] != 0) & (cleaned['ACW'] != 0)

# .copy() gives an independent frame, so adding a column below does not trigger
# pandas' SettingWithCopyWarning.
d = cleaned.loc[both_nonzero].copy()

# Cut attendance into 5 equal-width ranges. Labels can be supplied to make the bands easier to read, e.g.
#   labels=["very poor", "poor", "okay", "good", "very good"]
d['Atten Type'] = pd.cut(d['Attendance'], 5)

In [None]:
# Five random rows; the new 'Atten Type' column feature is visible alongside the others.
d.sample(5)

In [None]:
# One box per attendance band, showing how the continuous ACW mark spreads within it.
sns.boxplot(data=d, x='Atten Type', y='ACW')

In [None]:
# One box per attendance band again, this time for the Exam mark.
sns.boxplot(data=d, x='Atten Type', y='Exam')

In [None]:
# Simple summary metrics (mean and standard deviation) for the ACW marks.
# Depending on the spread, these could serve as a heuristic for spotting outliers.
acw_marks = d['ACW']
(acw_marks.mean(), acw_marks.std())

In [None]:
# Univariate plot for our ACW marks.
# histplot with a KDE overlay on a density scale replaces the deprecated distplot.
sns.histplot(x=d['ACW'], kde=True, stat="density");