In [14]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, roc_auc_score

# Load the raw sightings and keep only the four columns used below, renamed
# to short labels.
ufos = pd.read_csv('data/ufos.csv')
ufos = ufos[['duration (seconds)', 'country', 'latitude', 'longitude']].rename(
    columns={
        'duration (seconds)': 'Seconds',
        'country': 'Country',
        'latitude': 'Latitude',
        'longitude': 'Longitude',
    }
)

# To inspect the raw country labels, uncomment:
#print(ufos.Country.unique())

# Drop rows with any missing value, then keep sightings lasting 1-60 seconds
# (both bounds inclusive). The explicit copy makes the result an independent
# frame, so the column assignment below does not write into a slice of the
# unfiltered data (which pandas reports as SettingWithCopyWarning).
ufos = ufos.dropna()
ufos = ufos[ufos['Seconds'].between(1, 60)].copy()
#ufos.info()

# Convert the text value of countries to an integer id. The fitted encoder is
# kept so ids can be translated back to labels later, e.g. with
# country_encoder.inverse_transform(...) or country_encoder.classes_.
country_encoder = LabelEncoder()
ufos['Country'] = country_encoder.fit_transform(ufos['Country'])

# Features are the sighting duration and its coordinates; the target is the
# encoded country id.
Selected_features = ['Seconds', 'Latitude', 'Longitude']
X, y = ufos[Selected_features], ufos['Country']

# Split into training and testing sets: 20% of the rows are held out for
# testing, and the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0
)

# Fit a logistic-regression classifier with default settings, then predict a
# country id for every held-out row.
model = LogisticRegression().fit(X_train, y_train)
predictions = model.predict(X_test)

# Compute both summaries before printing them.
# NOTE(review): overall accuracy can look high simply because one country
# dominates the rows; check the per-class recall in the report before
# trusting the headline number.
report = classification_report(y_test, predictions)
accuracy = accuracy_score(y_test, predictions)

print(report)
print('Predicted labels: ', predictions)
print('Accuracy: ', accuracy)

# 'Pickle' the trained model to disk. The context manager closes the file
# handle (flushing the data) even if serialization raises.
model_filename = 'ufo-model.pkl'
with open(model_filename, 'wb') as model_file:
    pickle.dump(model, model_file)

# Reload the model from the same path to check the round trip.
# NOTE: pickle.load can execute arbitrary code embedded in the file; only
# load pickles that come from a trusted source.
with open(model_filename, 'rb') as model_file:
    model = pickle.load(model_file)

# Test it against a sample row holding values for seconds, latitude and
# longitude, in that order.
print(model.predict([[50,44,-12]]))



              precision    recall  f1-score   support

           0       1.00      1.00      1.00        41
           1       1.00      0.02      0.05       250
           2       0.00      0.00      0.00         8
           3       0.94      1.00      0.97       131
           4       0.95      1.00      0.97      4743

   micro avg       0.95      0.95      0.95      5173
   macro avg       0.78      0.60      0.60      5173
weighted avg       0.95      0.95      0.93      5173

('Predicted labels: ', array([4, 4, 4, ..., 3, 4, 4]))
('Accuracy: ', 0.9512855209742895)
[3]
