In [7]:
import sys
assert sys.version_info >= (3, 5)
# Python ≥3.5 is required

# Scikit-Learn ≥0.20 is required
import re
import sklearn
# Compare (major, minor) as integers: comparing the raw version strings is
# lexicographic, so e.g. "0.9" >= "0.20" would wrongly pass the check.
_sklearn_version = tuple(map(int, re.match(r"(\d+)\.(\d+)", sklearn.__version__).groups()))
assert _sklearn_version >= (0, 20), "Scikit-Learn >=0.20 is required, found " + sklearn.__version__

# Common imports
import numpy as np
import os
import tarfile
import urllib
import pandas as pd

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

In [10]:

# Base URL the CSV files are downloaded from, and the local directory they are stored in.
DOWNLOAD_ROOT = "http://www.macs.hw.ac.uk/%7Eek19/data/"
CURRENT_PATH = os.path.join(os.getcwd(), "datasets")
X_FILE = "x_train_gr_smpl.csv"
Y_FILE = "y_train_smpl.csv"

def fetch_data(download_root=DOWNLOAD_ROOT, current_path=CURRENT_PATH):
    """Download X_FILE and Y_FILE into ``current_path`` if they are not already there.

    Each file is checked independently, so a file that is missing is fetched
    even when the other one is already on disk. Files that already exist are
    left untouched.

    Parameters
    ----------
    download_root : str
        URL prefix; the file name is appended to it to build each download URL.
    current_path : str
        Local directory the files are written to (created if necessary).

    Returns
    -------
    None

    Raises
    ------
    Whatever ``urllib.request.urlretrieve`` raises when a download fails
    (e.g. ``urllib.error.URLError``).
    """
    # `import urllib` alone does not guarantee the `request` submodule is loaded.
    import urllib.request

    missing = [
        name for name in (X_FILE, Y_FILE)
        if not os.path.isfile(os.path.join(current_path, name))
    ]
    if not missing:
        return

    os.makedirs(current_path, exist_ok=True)

    for name in missing:
        target = os.path.join(current_path, name)
        # Download to a temporary name first so an interrupted transfer never
        # leaves a truncated file that a later call would mistake for a complete one.
        partial = target + ".part"
        try:
            urllib.request.urlretrieve(download_root + name, partial)
        except BaseException:
            if os.path.exists(partial):
                os.remove(partial)
            raise
        os.replace(partial, target)
    

In [11]:
# Fetch the CSV files using the default download URL and local directory.
fetch_data()

In [12]:
def load_features_data(current_path=CURRENT_PATH):
    """Read the X_FILE CSV located in ``current_path`` and return it as a DataFrame."""
    features_csv = os.path.join(current_path, X_FILE)
    return pd.read_csv(features_csv)

def load_labels_data(current_path=CURRENT_PATH):
    """Read the Y_FILE CSV located in ``current_path`` and return it as a DataFrame."""
    labels_csv = os.path.join(current_path, Y_FILE)
    return pd.read_csv(labels_csv)

In [13]:
# Load the feature CSV into a DataFrame.
features = load_features_data()
features.head() # Display the top five rows of the dataframe

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2294,2295,2296,2297,2298,2299,2300,2301,2302,2303
0,78.0,77.0,76.0,82.0,87.0,92.0,104.0,119.0,117.0,120.0,...,87.0,79.0,72.0,76.0,83.0,95.0,99.0,98.0,95.0,94.0
1,73.0,75.0,79.0,78.0,76.0,75.0,89.0,107.0,133.0,125.0,...,96.0,93.0,85.0,77.0,69.0,73.0,83.0,100.0,101.0,101.0
2,72.0,75.0,79.0,77.0,81.0,89.0,105.0,109.0,86.0,90.0,...,98.0,95.0,88.0,80.0,73.0,71.0,74.0,80.0,89.0,95.0
3,67.0,70.0,74.0,80.0,93.0,107.0,110.0,96.0,69.0,100.0,...,112.0,92.0,87.0,82.0,77.0,72.0,70.0,72.0,81.0,88.0
4,74.0,74.0,73.0,72.0,77.0,87.0,104.0,109.0,84.0,83.0,...,100.0,98.0,99.0,100.0,99.0,89.0,78.0,66.0,68.0,72.0


In [14]:
# Print a concise summary of the DataFrame: index range, number of columns, dtypes and memory usage.
features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9690 entries, 0 to 9689
Columns: 2304 entries, 0 to 2303
dtypes: float64(2304)
memory usage: 170.3 MB


In [15]:
# Show summary statistics (count, mean, std, min, quartiles, max) for each numerical column.
features.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2294,2295,2296,2297,2298,2299,2300,2301,2302,2303
count,9690.0,9690.0,9690.0,9690.0,9690.0,9690.0,9690.0,9690.0,9690.0,9690.0,...,9690.0,9690.0,9690.0,9690.0,9690.0,9690.0,9690.0,9690.0,9690.0,9690.0
mean,89.658514,89.545201,89.343756,89.56677,89.81259,89.853251,89.883282,89.978431,90.170175,90.519092,...,72.234881,71.833437,71.394118,70.572136,69.660165,68.878638,68.29453,67.831889,67.548297,67.393189
std,77.521341,77.430539,77.350452,77.498754,77.437292,77.279125,77.085714,77.081732,77.062371,77.155644,...,65.369022,65.439225,65.402298,65.144238,64.885963,64.530366,64.27455,63.79935,63.437102,63.506304
min,5.0,5.0,4.0,5.0,5.0,5.0,5.0,4.0,4.0,4.0,...,4.0,4.0,4.0,5.0,4.0,4.0,4.0,5.0,4.0,4.0
25%,32.0,32.0,32.0,31.0,32.0,32.0,32.0,33.0,33.0,33.0,...,26.0,26.0,26.0,26.0,26.0,25.0,25.0,25.0,25.0,25.0
50%,57.0,58.0,57.0,58.0,58.0,58.0,58.0,58.0,59.0,59.0,...,46.0,46.0,46.0,45.0,45.0,44.0,44.0,43.0,43.0,43.0
75%,126.0,124.0,122.75,123.0,125.0,123.75,124.0,124.0,124.0,126.0,...,95.0,93.0,91.0,88.0,86.0,84.0,82.0,82.0,81.0,81.0
max,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,...,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0


In [16]:
# Load the label CSV into a DataFrame.
labels = load_labels_data()
labels.head() # Display the top five rows of the dataframe

Unnamed: 0,0
0,0
1,0
2,0
3,0
4,0
