#  Basic Data Science in Python - Exercises 21/9  #

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn import datasets
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture as GM
import seaborn as sns

### Exercise 1: Data Preprocessing
Let us brush up on the Data Preprocessing we learnt in the last course. 

##### Task 1
First, import the dataset $\texttt{netflix_titles.csv}$ from the data folder. 

##### Task 2
There are some null values in the duration column. This is because these values have turned up in the rating column instead. Move these duration values from the rating column to the duration column. You can replace the misplaced entries in the rating column with NaN for now.

##### Task 3
Create two new data frames, one for movies and one for TV shows. Create a bar plot of the age ratings of the movies on Netflix.

##### Task 4
Let's see if the percentage of horror movies rated R is higher than the percentage of movies in general rated R. Find the percentage of horror movies rated R, and compare that to the percentage of all movies rated R.

### Exercise 2: Statistics with Python
Use pandas to calculate the contingency table, and use the $\texttt{scipy.stats}$ library to perform a Chi-Squared test for independence on the Titanic dataset, with respect to the class of the passengers and whether they survived. What can you conclude?

In [None]:
# Exercise 2 scaffold: load the Titanic data, build a contingency table of
# passenger class against survival, and show it as a heatmap.
from scipy.stats import chi2_contingency
titanic = pd.read_csv("../data/titanic.csv")
# Placeholder: assign the contingency table to this name in the section
# below, otherwise the heatmap call at the end has nothing to draw.
contingency = None
# The two categorical variables to cross-tabulate.
x = titanic["Pclass"]
y = titanic["Survived"]
### YOUR CODE HERE - calculate the contingency table

### YOUR CODE HERE

# Visualize the contingency table as a heatmap with each cell's value
# written inside it (annot=True).
plt.figure(figsize=(6,6))
sns.heatmap(contingency, annot=True, cmap="YlGnBu")

In [None]:
### YOUR CODE HERE - perform chi2 test

### Exercise 3: The Hello World of Scikit Learn
Let's get comfortable with using methods from scikit learn. It is often a good idea to take a look at the [documentation](https://scikit-learn.org/stable/) of a method before using it. Take a look at the documentation for [k-Means](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html) and use this method to cluster the below dataset.

In [None]:
# Exercise 3 scaffold: generate 100 two-dimensional points around 2 centers.
# The true blob labels returned by make_blobs are discarded on purpose.
X, _ = datasets.make_blobs(n_samples=100, centers=2, n_features=2, center_box=(0, 10), cluster_std=0.7)
# Placeholder labels (every point in cluster 0); overwrite `clusters` with
# the labels found by k-Means in the section below.
clusters = [0 for _ in range(X.shape[0])]
### YOUR CODE HERE

### YOUR CODE HERE
# Scatter the points (first column against second), coloured by cluster label.
plt.scatter(*X.T, c=clusters)

### Exercise 4: PCA and EM (Handin)
Use Principal Component Analysis to reduce the dimensionality of the Iris Dataset to 2D, and then use Gaussian Mixtures to assign the points to three clusters. Plot the result.

In [None]:
# Exercise 4 scaffold: load the Iris dataset and keep only its feature
# matrix; the PCA and Gaussian Mixture steps go in the section below.
iris = datasets.load_iris()
X = iris.data
### YOUR CODE HERE

### Exercise 5: PCA as Noise Filtering
Principal Component Analysis can be used to filter noisy data. Below is a dataset consisting of handwritten digits, with added noise:

In [None]:
def plot_digits(digits, label=""):
    """Draw the first 16 entries of ``digits`` as images in a 4x4 grid.

    Args:
        digits: Indexable collection whose first 16 entries can each be
            reshaped to 8x8 (i.e. hold 64 values).
        label: Text passed to ``plt.title`` for the figure.
    """
    grid_side = 4
    canvas = plt.figure(figsize=(4, 4))
    plt.title(label)
    # Remove the outer margins and leave only a thin gap between panels.
    canvas.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0.05, wspace=0.05)

    # Subplot positions are 1-based, while the digit rows are 0-based.
    for slot in range(1, grid_side * grid_side + 1):
        panel = canvas.add_subplot(grid_side, grid_side, slot, xticks=[], yticks=[])
        pixels = digits[slot - 1].reshape(8, 8)
        panel.imshow(pixels, cmap=plt.cm.binary, interpolation='nearest')

# Original dataset: the feature matrix of the scikit-learn digits data,
# one digit per row (plot_digits reshapes each row to 8x8 for display).
digits = datasets.load_digits().data
plot_digits(digits, label="original data")

# Dataset with noise: every value is redrawn from a normal distribution
# centred on the original value with standard deviation 4. The global seed
# is fixed first so the noise is the same on every run.
np.random.seed(42)
noisy_digits = np.random.normal(digits, 4)
plot_digits(noisy_digits, label="data with noise")

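Use PCA to filter some of the noise out. You can do this by first projecting the noisy data onto its principal components (keeping only the components that explain most of the variance), and then applying the inverse transform to map the result back to the original pixel space.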

In [None]:
### YOUR CODE HERE

### Exercise 6: Implement k-Means (Hard)

Implement a simple k-Means algorithm, and test your implementation on the below dataset. Just run for a fixed number of iterations $\texttt{max_iter}$, so don't worry about convergence.

In [None]:
# Exercise 6 test data: 100 two-dimensional points around 2 centers.
# `y` holds the true blob labels and is used only to colour this reference plot.
X, y = datasets.make_blobs(n_samples=100, centers=2, n_features=2, center_box=(0, 10), cluster_std=0.7)
plt.scatter(*X.T, c=y)

In [None]:
def kMeans(X, k=2, max_iter=100):
    """Cluster the rows of ``X`` into ``k`` groups (exercise skeleton).

    Args:
        X: NumPy array of samples, one sample per row.
        k: Number of clusters, i.e. number of centroids.
        max_iter: Fixed number of iterations to run; no convergence check
            is expected.

    Returns:
        A sequence with one cluster label per row of ``X``. Until the
        exercise section below is filled in, every label is 0.

    Raises:
        ValueError: If ``k`` is larger than the number of rows in ``X``,
            because that many distinct starting centroids cannot be drawn.
    """
    n_samples = X.shape[0]

    #initialize random clusters
    clusters = [0] * n_samples
    # Draw the starting centroids WITHOUT replacement. Sampling with
    # replacement can pick the same point twice, which gives two identical
    # centroids and leaves one cluster empty after the first assignment.
    mu = X[np.random.choice(n_samples, k, replace=False)]

    ### YOUR CODE HERE


    ### YOUR CODE HERE

    return clusters
# Scatter the test data, coloured by the labels returned by kMeans.
plt.scatter(*X.T, c=kMeans(X))