# Imports and Installs

In [0]:
# install latest version of polyglot language detection library for stretch
# !pip install -U git+https://github.com/aboSamoor/polyglot.git@master

In [0]:
!pip install plotly

In [0]:
import pandas as pd
import numpy as np 
import pandas_profiling 
from sklearn import preprocessing # for category encoder
# from polyglot.detect import Detector # for language detection stretch
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
# much more efficient for larger files like Nearest Neighbors which the model
import joblib
import plotly.express as px
import plotly.graph_objects as go

# EDA & PROFILE REPORT

In [0]:
# Load the Spotify track-features CSV directly from its GitHub raw URL.
SPOTIFY_FEATURES_URL = 'https://raw.githubusercontent.com/aguilargallardo/DS-Unit-2-Applied-Modeling/master/data/SpotifyFeatures.csv'
df = pd.read_csv(SPOTIFY_FEATURES_URL)
# pandas_profiling.ProfileReport(df)

In [0]:
df.head()

In [0]:
df.describe()

In [0]:
def create_web_chart(df):
    """Build a radar ("web") chart with one polar trace per row of ``df``.

    Relies on ``go`` (``plotly.graph_objects``) being imported in the
    notebook's import cell.

    Parameters
    ----------
    df : pandas.DataFrame
        Tracks to plot. Must contain the seven audio-feature columns listed in
        ``music_features`` plus ``artist_name`` and ``track_name``, which are
        combined into each trace's legend label.

    Returns
    -------
    plotly.graph_objects.Figure
        Figure holding one filled ``Scatterpolar`` trace per row. The figure is
        returned rather than shown so the caller decides when to render it.
    """
    music_features = ['acousticness', 'danceability', 'energy', 'instrumentalness', 'liveness', 'speechiness', 'valence']

    fig = go.Figure()

    # Read the values from the row that iterrows() already yields instead of
    # looking the label up again with df.loc[idx]: when an index label is
    # repeated, that lookup returns several rows and corrupts the trace.
    for _, row in df.iterrows():
        fig.add_trace(go.Scatterpolar(
            r=[row[feature] for feature in music_features],
            theta=list(music_features),
            fill='toself',
            name=f'{row.artist_name} - {row.track_name}'
        ))
    return fig

fig = create_web_chart(df.iloc[0:5])
fig.show()

# first_song_values = list(df.iloc[0][music_features].values)
# second_song_values = list(df.iloc[1][music_features].values)

In [0]:
# Build the long-format (feature, values) frame for the first track here, so the
# cell runs on a fresh kernel; `first_song_df` was never defined in this notebook.
music_features = ['acousticness', 'danceability', 'energy', 'instrumentalness', 'liveness', 'speechiness', 'valence']
first_song_df = (
    df.iloc[0][music_features]
    .astype(float)          # a row slice of a mixed-dtype frame comes back as object
    .rename('values')
    .rename_axis('feature')
    .reset_index()
)

fig = px.bar(first_song_df, x='feature', y="values", color='feature')
fig.show()

In [0]:
# Overlay the first two songs on a single radar chart so their feature profiles
# can be compared. The original cell referenced an undefined `second_song_df`
# and added its polar trace to the bar chart left over from the previous cell;
# building the figure with create_web_chart keeps this cell self-contained.
fig = create_web_chart(df.iloc[0:2])

# Pin the radial axis to [0, 1] so every trace is drawn against the same scale.
fig.update_layout(
  polar=dict(
    radialaxis=dict(
      visible=True,
      range=[0, 1]
    )),
  showlegend=True
)

fig.show()

In [0]:
df.isna().sum()

# DATA ENGINEERING CATEGORY ENCODING OF OBJECT FEATURES
Note: we might need to create another column of encoded languages.
Resource: https://chrisalbon.com/machine_learning/preprocessing_structured_data/convert_pandas_categorical_column_into_integers_for_scikit-learn/


In [0]:
# # test of polyglot detector

# arabic_text = u"""
# أفاد مصدر امني في قيادة عمليات صلاح الدين في العراق بأن " القوات الامنية تتوقف لليوم
# الثالث على التوالي عن التقدم الى داخل مدينة تكريت بسبب
# انتشار قناصي التنظيم الذي يطلق على نفسه اسم "الدولة الاسلامية" والعبوات الناسفة
# والمنازل المفخخة والانتحاريين، فضلا عن ان القوات الامنية تنتظر وصول تعزيزات اضافية ".
# """

# detector = Detector(arabic_text)
# print(detector.language)

"""
   TODO: figure out UTF-8 ERROR or just do try catch exception and return 
   unknown or something
   below code does not work
"""

# df.track_name = df.track_name.astype(str)
# df['poly_obj'] = df.track_name.apply(lambda x: Detector(x, quiet=True))
# df['Track-lang'] = df['poly_obj'].apply(lambda x: icu.Locale.getDisplayName(x.language.locale))
# df['Track-LangConfidence'] = df['poly_obj'].apply( lambda x: x.language.confidence)


In [0]:
df.key.unique()

In [0]:
df.key.value_counts()

In [0]:
# df = df.dropna() # drop null values

def pre_process(df):
    """Return an encoded copy of ``df`` ready for modeling.

    ``key``, ``time_signature`` and ``mode`` are mapped from their string
    labels to integer codes, and ``genre`` is one-hot encoded into extra
    ``genre_*`` columns appended on the right (the original ``genre`` column
    is kept).

    The input frame is left untouched. Encoding it in place made this step
    non-idempotent: calling it a second time on the same frame mapped the
    already-encoded integers through the string dictionaries and produced
    all-NaN columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``key``, ``time_signature``, ``mode`` and
        ``genre``.

    Returns
    -------
    pandas.DataFrame
        New frame with the three columns encoded and the genre dummies added.
        Labels missing from a mapping become NaN.
    """
    time_sig_encoding = { '0/4' : 0, '1/4' : 1, 
                     '3/4' : 3, '4/4' : 4,
                     '5/4' : 5}

    key_encoding = { 'A' : 0, 'A#' : 1, 'B' : 2,
                'C' : 3,  'C#' : 4,  'D' : 5,
                'D#' : 6, 'E' : 7, 'F' : 8,
                'F#' : 9, 'G' : 10, 'G#' : 11 }

    mode_encoding = { 'Major':0, 'Minor':1}      

    # assign() returns a new frame, so the caller's DataFrame is never modified.
    encoded = df.assign(
        key=df['key'].map(key_encoding),
        time_signature=df['time_signature'].map(time_sig_encoding),
        mode=df['mode'].map(mode_encoding),
    )

    # One-hot encode genre and bind the dummy columns to the right of the frame.
    genre_dummies = pd.get_dummies(encoded[['genre']])
    return pd.concat([encoded, genre_dummies], axis=1)

processed_df = pre_process(df)

# df = df.dropna() # drop null values again not sure why it created null values

# MODELING: Nearest Neighbors
resources: https://scikit-learn.org/stable/modules/neighbors.html

In [0]:
neigh = NearestNeighbors(n_neighbors=10)

In [0]:
# to remove the transformed columns from model 
# remove = ['key', 'mode','time_signature']
# features = [i for i in list(df.columns[4:]) if i not in remove]

# NOTE(review): positional slice — presumably skips four leading identifier/label
# columns (genre, artist_name, track_name, track_id); confirm against df.head().
features = list(processed_df.columns[4:])

# target = 'track_id'

In [0]:
X = processed_df[features].values
# y = df[target]

X.shape # y.shape

In [0]:
%time
neigh.fit(X) # NN doesn't need to fit Y

In [0]:
# random ariana grande song 
test_song = X[9027] 

df.iloc[9027]

In [0]:
# Wrap the single song vector in an outer list so the query is a 2-D batch of one.
query_batch = np.array([test_song])
distance, neighbors = neigh.kneighbors(query_batch)

distance

In [0]:
neighbors

In [0]:
neighbors_df = df.iloc[neighbors[0]]

In [0]:
fig = create_web_chart(neighbors_df)
fig.show()

In [0]:
# Print every neighbor except the one at position 0
# (presumably the query song itself — confirm against the distances above).
for match in df.iloc[neighbors[0][1:]].itertuples(index=False):
    print(f'Artist: {match.artist_name} - Track: {match.track_name}')

In [0]:
# TODO: make a function that takes a song ID
# and returns a list of its nearest-neighbor songs

# Export Model with Joblib

In [0]:
filename = 'NearestNeighborGenres.sav'

In [0]:
joblib.dump(neigh, filename)

In [0]:
# one hot encoding, rolled back
# def encode_and_bind(original_dataframe, feature_to_encode):
#     dummies = pd.get_dummies(original_dataframe[[feature_to_encode]])
#     res = pd.concat([original_dataframe, dummies], axis=1)
#     return(res)

# df = encode_and_bind(df, 'key')
# df = encode_and_bind(df, 'mode')
# df = encode_and_bind(df, 'time_signature')

In [0]:
import pickle

# The model to export: the NearestNeighbors estimator fitted earlier in this
# notebook. (The original placeholder `model = #whatever...` was a SyntaxError.)
model = neigh

# To save the model variable as a pickle file:

filename = 'filename.pkl'

with open(filename, 'wb') as f:
    pickle.dump(model, f)
# Then send this pkl file to the DE team.

# DE team:
# Make sure to save the pkl file in the same folder as the app.

# To read it into the app:
# SECURITY: pickle.load can execute arbitrary code embedded in the file —
# only load pickle files that come from a trusted source.
with open(filename, 'rb') as f:
    model = pickle.load(f)