# Air Pollution 

### 1. Importing libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.linear_model import LogisticRegression

### 2. Input Processing

In [2]:
path_to_data=r"D:\dataScience\machine-learning-online-2018-master\Datasets\airPollution"

In [4]:
# getting the data 
dataModel=pd.read_csv(path_to_data+"\Train.csv")

In [5]:
dataModel.head(n=6)

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,target
0,0.293416,-0.945599,-0.421105,0.406816,0.525662,-82.154667
1,-0.836084,-0.189228,-0.776403,-1.053831,0.597997,-48.89796
2,0.236425,0.132836,-0.147723,0.699854,-0.187364,77.270371
3,0.175312,0.143194,-0.581111,-0.122107,-1.292168,-2.988581
4,-1.693011,0.542712,-2.798729,-0.686723,1.244077,-37.596722
5,-1.055329,-0.874807,0.580753,-0.773541,1.070888,-122.405263


In [6]:
data=dataModel.values

In [8]:
data.shape # 5 columns are the X_train data, whereas the last column is the Y_train(label data)

(1600, 6)

In [9]:
#doing the same things for the testing without actually showing the table this time
testModel=pd.read_csv(path_to_data+"\Test.csv")
testData=testModel.values

In [10]:
testData.shape # only the testing data and we have to predict the labels

(400, 5)

In [11]:
# dividing the training data between data and labels
X_train=data[:,:5]
Y_train=data[:,5]

In [12]:
print(X_train.shape, Y_train.shape)

(1600, 5) (1600,)


### 3. Applying LOWESS

In [32]:
u = X_train.mean()
std = X_train.std()
X_train = (X_train-u)/std

In [33]:
def getW(query_point,X,tau):
    M = X.shape[0]
    W = np.mat(np.eye(M))
    
    for i in range(M):
        xi = X[i]
        x  = query_point
        W[i,i] =  np.exp(np.dot((xi-x),(xi-x).T)/(-2*tau*tau))
    return W

In [34]:
Y_train=Y_train.reshape((Y_train.shape[0],1))
X = np.mat(X_train)
Y = np.mat(Y_train)
M = X.shape[0]

W = getW(-1,X,100)
print(W.shape)
print(W)

(1600, 1600)
[[0.99967535 0.         0.         ... 0.         0.         0.        ]
 [0.         0.99983029 0.         ... 0.         0.         0.        ]
 [0.         0.         0.99963507 ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.99927021 0.         0.        ]
 [0.         0.         0.         ... 0.         0.99938112 0.        ]
 [0.         0.         0.         ... 0.         0.         0.99953438]]


### 4. Make Predictions

In [35]:
def predict(X,Y,query_x,tau):
    ones = np.ones((M,1))
    X_ = np.hstack((X,ones))
    
    qx = np.mat([*query_x,1])
    
    W = getW(qx,X_,tau)
    
    #theta = `(X′WX)inv * X′WY`
    theta = np.linalg.pinv(X_.T*(W*X_))*(X_.T*(W*Y))
    #print(theta.shape)
    pred = np.dot(qx,theta)
    return theta,pred

In [36]:
theta,pred = predict(X,Y,1.0,1.0)

TypeError: 'float' object is not iterable

In [37]:
print(theta)

[[30.88578391]
 [93.28550985]
 [ 7.27114368]
 [46.19840956]
 [ 1.00782365]
 [ 2.30331982]]


In [38]:
print(pred)

[[180.95199046]]


In [39]:
N=testData.shape[0]

In [40]:
def prediction(tau):
    Y_test = []
    for xq in testData:
        theta,pred = predict(X,Y,xq,tau)
        Y_test.append(pred[0][0])
    Y_test=np.array(Y_test)
    print(Y_test.shape)
    return Y_test

In [47]:
taus = [0.1,0.5,1,5,10]
for i,t in enumerate(taus):
    y=prediction(t)
    y=y.reshape((y.shape[0],-1))
    df=pd.DataFrame(y)
    df.to_csv(path_to_data+"\y_test_"+str(i)+".csv",header=["target"])

(400, 1, 1)
(400, 1, 1)
(400, 1, 1)
(400, 1, 1)
(400, 1, 1)
