# Practical Machine Learning
# Toader Liviu Eduard - Group 407

## 1. Imports

In [1]:
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, GRU, Dropout, Dense

## 2. Reading the data

Read the training and validation files

The Twitter messages are the feature <b>x</b> 

The latitude and longitude coordinates are the label <b>y</b>

In [2]:
def read(file):
    columns = ['id', 'lat', 'long', 'message']
    df = pd.read_csv(file, delimiter=',', names=columns)
    x = df['message']
    y = df[['lat', 'long']]
    return x, y

train_x, train_y = read('training.txt')
validation_x, validation_y = read('validation.txt')

Print the messages from the training set

In [3]:
train_x

0        Seit d Vase: "Wenn ich kaputt gang, bringt das...
1        Haha bin au w isch der amig au so richtig lang...
2        isch d hiltl dachterrasse amne samstig viel bs...
3        Ich fÃ¼hle mich wie die Weimarer Republik... .....
4        Eui liebschte Lunchidee zum Mitneh? ðŸ˜¬ En Grill...
                               ...                        
22578    Bin grad in Bus igstige, da seit de Buschauffe...
22579    Rien ne surpassera Dragostea Din Tei de O-zone...
22580    het Ã¶pert au kei bock meh zum schaffa und lust...
22581    Oh wenn wedermol en jodel -5 het wos ned verdi...
22582    Zerst hani glachet das min Kollege Ã¤ndlich au ...
Name: message, Length: 22583, dtype: object

Print the coordinates from the training set

In [4]:
train_y

Unnamed: 0,lat,long
0,51.810067,10.191331
1,51.918188,10.599245
2,52.711074,9.987374
3,52.386711,11.700612
4,52.314631,9.701835
...,...,...
22578,51.884863,10.487841
22579,49.935479,7.051477
22580,50.597534,12.055682
22581,51.848082,8.554886


Read the test file

In [5]:
test_file = pd.read_csv('test.txt', delimiter=',', names=['id', 'message'])
test_x = test_file['message']

## 3. Preprocessing the data

Scale the coordinates to prepare them for the neural network

In [6]:
scaler = StandardScaler().fit(train_y)
train_y = scaler.transform(train_y)
validation_y = scaler.transform(validation_y)

Print the scaled coordinates from the training set

In [7]:
train_y

array([[ 0.09224194,  0.60083834],
       [ 0.2128527 ,  0.90798381],
       [ 1.09733166,  0.44726561],
       ...,
       [-1.260361  ,  2.00463273],
       [ 0.13464846, -0.63135015],
       [ 1.36553917, -0.06534156]])

Create a vocabulary based on the training set

Limit the number of unique words (vocabulary) to 5000

Words out of vocabulary (OOV) will receive a special token

In [8]:
vocabulary = 5000
tokenizer = Tokenizer(num_words=vocabulary, oov_token='<OOV>')
tokenizer.fit_on_texts(train_x)

Convert the words of the messages to integers

Add padding to the left with integer 0 until all messages have the same length

In [9]:
def vectorize(text):
    v = tokenizer.texts_to_sequences(text)
    return pad_sequences(v)

train_x = vectorize(train_x)
validation_x = vectorize(validation_x)
test_x = vectorize(test_x)

Print the converted messages from the training set

In [10]:
train_x

array([[   0,    0,    0, ...,  275,  179, 4854],
       [   0,    0,    0, ...,    6,  249,   33],
       [   0,    0,    0, ...,    3,   59,  159],
       ...,
       [   0,    0,    0, ...,  731,    2, 1320],
       [   0,    0,    0, ...,   78,   22,    1],
       [   0,    0,    0, ..., 1725, 1499,   33]])

## 4. Creating the neural network

Construct a recurrent neural network model with:

- word embeddings with the output dimension 20 (each word from the vocabulary will have 20 values)

- 20 gated recurrent units (I also tried LSTM, but the results were similar or slightly worse on my runs)

- dropout of 20% to avoid overfitting, both inside the GRU and after it

- 20 dense units with ReLU activation function (I also tried other functions, but didn't find any significant improvement)

- 2 outputs (the latitude and longitude coordinates)

In [11]:
model = Sequential([
    Embedding(vocabulary, 20),
    Bidirectional(GRU(20, return_sequences=True)),
    Dropout(0.2),
    Bidirectional(GRU(20)),
    Dropout(0.2),
    Dense(20, activation='relu'),
    Dense(2)
])

I used a 20% validation split from the training set to find out how many epochs to choose

After the 5th epoch, the validation loss is increasing and the training loss is decreasing (overfitting)

Run the neural network for 5 epochs 

In [12]:
model.compile(loss='mean_absolute_error', optimizer='adam')
model.fit(train_x, train_y, epochs=5, verbose=True)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x24ec860a310>

## 5. Testing on the validation set

Calculate the predictions for the validation set

Rescale the predictions and the true values

In [13]:
validation_y_predicted = model.predict(validation_x)
validation_y_predicted = scaler.inverse_transform(validation_y_predicted)
validation_y = scaler.inverse_transform(validation_y)

Print the predictions

In [14]:
validation_y_predicted

array([[52.065086 ,  9.93792  ],
       [51.630386 ,  7.7898655],
       [51.33574  ,  9.023943 ],
       ...,
       [52.182636 ,  9.116333 ],
       [52.026104 ,  9.966114 ],
       [51.905067 ,  9.496813 ]], dtype=float32)

Print the absolute and the squared errors

In [15]:
mean_absolute_error(validation_y_predicted, validation_y)

0.5212843324687777

In [16]:
mean_squared_error(validation_y_predicted, validation_y)

0.5760280447764253

## 6. Making the submission

Calculate the predictions for the test set and rescale them

In [17]:
test_y_predicted = model.predict(test_x)
test_y_predicted = scaler.inverse_transform(test_y_predicted)

Create the submission.txt file

In [18]:
submission = pd.DataFrame()

submission['id'] = list(test_file['id'])
submission['lat'] = [row[0] for row in test_y_predicted]
submission['long'] = [row[1] for row in test_y_predicted]

submission.to_csv('./submission.txt', header=True, index=None, sep=',')

Print the submission

In [19]:
submission

Unnamed: 0,id,lat,long
0,300121,52.127365,10.225500
1,302441,52.056206,10.091029
2,300266,50.494244,7.636882
3,300911,51.040257,11.539071
4,302681,50.416103,7.704494
...,...,...,...
3133,300151,52.179310,10.042127
3134,300302,52.064327,9.225827
3135,301963,51.279819,9.574130
3136,302180,50.815674,8.126704
