## Import all the essential libraries

In [3]:
!pip install nltk
!pip install torch
!pip install torchaudio
!pip install librosa
!pip install soundfile
!pip install scipy
!pip install IPython
!pip install transformers
!pip install huggingface_hub


[notice] A new release of pip available: 22.1.2 -> 22.2.2
[notice] To update, run: python.exe -m pip install --upgrade pip

[notice] A new release of pip available: 22.1.2 -> 22.2.2
[notice] To update, run: python.exe -m pip install --upgrade pip
Collecting librosa
  Downloading librosa-0.9.2-py3-none-any.whl (214 kB)
     ------------------------------------ 214.3/214.3 kB 726.3 kB/s eta 0:00:00
Collecting pooch>=1.0
  Downloading pooch-1.6.0-py3-none-any.whl (56 kB)
     -------------------------------------- 56.3/56.3 kB 742.2 kB/s eta 0:00:00
Collecting resampy>=0.2.2
  Downloading resampy-0.4.0-py3-none-any.whl (3.1 MB)
     ---------------------------------------- 3.1/3.1 MB 1.1 MB/s eta 0:00:00
Collecting numba>=0.45.1
  Downloading numba-0.56.0-cp39-cp39-win_amd64.whl (2.5 MB)
     ---------------------------------------- 2.5/2.5 MB 1.4 MB/s eta 0:00:00
Collecting audioread>=2.1.9
  Downloading audioread-3.0.0.tar.gz (377 kB)
     -------------------------------------- 377.0/3


[notice] A new release of pip available: 22.1.2 -> 22.2.2
[notice] To update, run: python.exe -m pip install --upgrade pip

[notice] A new release of pip available: 22.1.2 -> 22.2.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
# transformers and torch
import torch
import librosa
import numpy as np
import soundfile as sf
from scipy.io import wavfile
from IPython.display import Audio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer

#nltk and sklearn
from nltk.tag.perceptron import PerceptronTagger
from nltk.stem import WordNetLemmatizer
from sklearn import decomposition
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from stemming.porter2 import stem
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from scipy.sparse import hstack

## Load tokenizer and model

In [2]:
tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'Wav2Vec2CTCTokenizer'. 
The class this function is called from is 'Wav2Vec2Tokenizer'.
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Check audio samples

In [4]:
file_name_0 = 'voc-audio-0.wav'
Audio(file_name_0)

In [7]:
file_name_1 = 'voc-audio-1.wav'
Audio(file_name_1)

In [8]:
file_name_2 = 'voc-audio-2.wav'
Audio(file_name_2)

In [203]:
import os
import wave

path = "C:\\Users\\Theodore\\Downloads\\wave2vec-speech-to-text"
directory = os.fsencode(path)
  
# Function to read audio files
def read_text_file(file_path):
    with open(file_path, 'r') as f:
        print(f.read())
        
def create_audio_list(directory):
    """
    This function will take a directory containing the audio files (in wave format) 
    and return an list of the audio file names as strings.
    
    Args:
         path (str): an encoded path of the directory containing the audio files
    Returns:
        list: a list of strings with names of the 
    """
    audiolist = []
    for file in os.listdir(directory):
        filename = os.fsdecode(file)
        if filename.endswith(".wav"): # Check whether file is in wav format or not
            audiolist.append(f"{filename}")
    return audiolist


def get_audiofile_metadata(list_of_audio_files):
    """"
    Function that takes a list of audio files as input and prints out metadata about each file. This includes sampling rate (Hz), length, number of channels (int), sample width (bytes)
    
    Args:
        list: an encoded path of the directory containing the audio files.
    Returns:
        textfile: a text file with containing metadata information about each audio file
    """
    for f in list_of_audio_files:
        with wave.open(f) as w:
            framerate = w.getframerate()
            frames = w.getnframes()
            channels = w.getnchannels()
            width = w.getsampwidth()
            print('\n')
            print('\t','%'*25, f"{f}", '%'*25)
            print('sampling rate:', framerate, 'Hz')
            print('length:', frames, 'samples')
            print('channels:', channels)
            print('sample width:', width, 'bytes')

            data = w.readframes(frames)

def get_transcriptions(audio_list):
    """
    
    """
    transcriptions = []
    for voicenote in audio_list:
        data = wavfile.read(voicenote)
        framerate = data[0]
        sounddata = data[1]
        time = np.arange(0,len(sounddata))/framerate
        input_audio, _ = librosa.load(voicenote, sr=16000)
        
        input_values = tokenizer(input_audio, return_tensors="pt").input_values
        logits = model(input_values).logits
        predicted_ids = torch.argmax(logits, dim=-1)

        transcription = tokenizer.batch_decode(predicted_ids)[0]
        transcriptions.append(transcription)
        with open('data.txt', 'w') as f:
            for line in transcriptions:
                f.write(line)
                f.write('\n')
    return transcriptions

%time get_transcriptions(audiolist)

def readFile(fileName):
    """
    This function will read the text files passed & return the list
    """
    fileObj = open(fileName, "r") #opens the file in read mode
    words = fileObj.read().splitlines() #puts the file into a list
    fileObj.close()
    return words
data = readFile('data.txt')

['voc-audio-0.wav',
 'voc-audio-1.wav',
 'voc-audio-10.wav',
 'voc-audio-100.wav',
 'voc-audio-101.wav',
 'voc-audio-102.wav',
 'voc-audio-103.wav',
 'voc-audio-104.wav',
 'voc-audio-105.wav',
 'voc-audio-106.wav',
 'voc-audio-107.wav',
 'voc-audio-108.wav',
 'voc-audio-109.wav',
 'voc-audio-11.wav',
 'voc-audio-110.wav',
 'voc-audio-111.wav',
 'voc-audio-112.wav',
 'voc-audio-113.wav',
 'voc-audio-114.wav',
 'voc-audio-115.wav',
 'voc-audio-116.wav',
 'voc-audio-117.wav',
 'voc-audio-118.wav',
 'voc-audio-119.wav',
 'voc-audio-12.wav',
 'voc-audio-120.wav',
 'voc-audio-121.wav',
 'voc-audio-122.wav',
 'voc-audio-123.wav',
 'voc-audio-124.wav',
 'voc-audio-125.wav',
 'voc-audio-126.wav',
 'voc-audio-127.wav',
 'voc-audio-128.wav',
 'voc-audio-129.wav',
 'voc-audio-13.wav',
 'voc-audio-130.wav',
 'voc-audio-131.wav',
 'voc-audio-132.wav',
 'voc-audio-133.wav',
 'voc-audio-134.wav',
 'voc-audio-135.wav',
 'voc-audio-136.wav',
 'voc-audio-137.wav',
 'voc-audio-138.wav',
 'voc-audio-139.wa

In [274]:
import wave

def get_audiofile_metadata(list_of_audio_files):
    """"
    Function that takes a list of audio files as input and prints out metadata about each file. This includes sampling rate (Hz), length, number of channels (int), sample width (bytes)
    
    Args:
        list: an encoded path of the directory containing the audio files.
    Returns:
        textfile: a text file with containing metadata information about each audio file
    """
    for f in list_of_audio_files:
        with wave.open(f) as w:
            framerate = w.getframerate()
            frames = w.getnframes()
            channels = w.getnchannels()
            width = w.getsampwidth()
            print('\n')
            print('\t','%'*25, f"{f}", '%'*25)
            print('sampling rate:', framerate, 'Hz')
            print('length:', frames, 'samples')
            print('channels:', channels)
            print('sample width:', width, 'bytes')

            data = w.readframes(frames)
            
get_audiofile_metadata(audiolist)



	 %%%%%%%%%%%%%%%%%%%%%%%%% voc-audio-0.wav %%%%%%%%%%%%%%%%%%%%%%%%%
sampling rate: 22050 Hz
length: 61116 samples
channels: 1
sample width: 2 bytes


	 %%%%%%%%%%%%%%%%%%%%%%%%% voc-audio-1.wav %%%%%%%%%%%%%%%%%%%%%%%%%
sampling rate: 22050 Hz
length: 39564 samples
channels: 1
sample width: 2 bytes


	 %%%%%%%%%%%%%%%%%%%%%%%%% voc-audio-10.wav %%%%%%%%%%%%%%%%%%%%%%%%%
sampling rate: 22050 Hz
length: 98094 samples
channels: 1
sample width: 2 bytes


	 %%%%%%%%%%%%%%%%%%%%%%%%% voc-audio-100.wav %%%%%%%%%%%%%%%%%%%%%%%%%
sampling rate: 22050 Hz
length: 197738 samples
channels: 1
sample width: 2 bytes


	 %%%%%%%%%%%%%%%%%%%%%%%%% voc-audio-101.wav %%%%%%%%%%%%%%%%%%%%%%%%%
sampling rate: 22050 Hz
length: 207773 samples
channels: 1
sample width: 2 bytes


	 %%%%%%%%%%%%%%%%%%%%%%%%% voc-audio-102.wav %%%%%%%%%%%%%%%%%%%%%%%%%
sampling rate: 22050 Hz
length: 45735 samples
channels: 1
sample width: 2 bytes


	 %%%%%%%%%%%%%%%%%%%%%%%%% voc-audio-103.wav %%%%%%%%%%%%%%%%%%%%%%%%%
sampl



	 %%%%%%%%%%%%%%%%%%%%%%%%% voc-audio-160.wav %%%%%%%%%%%%%%%%%%%%%%%%%
sampling rate: 22050 Hz
length: 59403 samples
channels: 1
sample width: 2 bytes


	 %%%%%%%%%%%%%%%%%%%%%%%%% voc-audio-17.wav %%%%%%%%%%%%%%%%%%%%%%%%%
sampling rate: 22050 Hz
length: 116871 samples
channels: 1
sample width: 2 bytes


	 %%%%%%%%%%%%%%%%%%%%%%%%% voc-audio-18.wav %%%%%%%%%%%%%%%%%%%%%%%%%
sampling rate: 22050 Hz
length: 52792 samples
channels: 1
sample width: 2 bytes


	 %%%%%%%%%%%%%%%%%%%%%%%%% voc-audio-19.wav %%%%%%%%%%%%%%%%%%%%%%%%%
sampling rate: 24000 Hz
length: 110592 samples
channels: 1
sample width: 2 bytes


	 %%%%%%%%%%%%%%%%%%%%%%%%% voc-audio-2.wav %%%%%%%%%%%%%%%%%%%%%%%%%
sampling rate: 22050 Hz
length: 92363 samples
channels: 1
sample width: 2 bytes


	 %%%%%%%%%%%%%%%%%%%%%%%%% voc-audio-20.wav %%%%%%%%%%%%%%%%%%%%%%%%%
sampling rate: 24000 Hz
length: 130176 samples
channels: 1
sample width: 2 bytes


	 %%%%%%%%%%%%%%%%%%%%%%%%% voc-audio-21.wav %%%%%%%%%%%%%%%%%%%%%%%%%
sampli

## Adjust sample rate and Generate transcriptions

In [275]:
def get_transcriptions(audio_list):
    """
    
    """
    transcriptions = []
    for voicenote in audio_list:
        data = wavfile.read(voicenote)
        framerate = data[0]
        sounddata = data[1]
        time = np.arange(0,len(sounddata))/framerate
        input_audio, _ = librosa.load(voicenote, sr=16000)
        
        input_values = tokenizer(input_audio, return_tensors="pt").input_values
        logits = model(input_values).logits
        predicted_ids = torch.argmax(logits, dim=-1)

        transcription = tokenizer.batch_decode(predicted_ids)[0]
        transcriptions.append(transcription)
        with open('data.txt', 'w') as f:
            for line in transcriptions:
                f.write(line)
                f.write('\n')
    return transcriptions

In [281]:
import time

print('\t','%'*25, 'Transcription started', '%'*25)
%time get_transcriptions(audiolist)

CPU times: total: 9min 36s
Wall time: 6min 42s


['WHAT IS THE TRACKING NUMBER FOR MY CART THAT WAS MAILED',
 'HOW DO I TRACK MY CARD',
 "I'M STARTING TO THINK MY CARD IS LOST BECAUSE IT STILL HASN'T ARRIVED CAN YOU HELP",
 "I RECENTLY GOT A NEW PLACE WHILE I'M STAYING ABROAD AND HAVE BEEN USING THIS ACCOUNT TO MANAGE PAYMENTS BUT SUDDENLY I AM SEEING FEES INCREASE WHERE ARE THESE ADDITIONAL FEES COMING FROM",
 'I PURCHASED SOME MAKE UP THROUGH A SIGHT IN CHINA AND I WAS UNDER THE IMPRESSION THAT WHEN I MAY TRANSFERS THERE IS NO FEE WHY AM I SEEING THIS FENOW I AM NOT HAPPY ABOUT THIS AT ALL',
 "I'M NOT PAYING THIS TRANSFER FEE",
 'I THOUGHT TRANSFERS WERE FREE WHY WAS I CHARGE TO FEE',
 'WHAT IS THE TRANSFER F E CHARGE',
 'WHY DID I GET CHARGED FOR SOMETHING I BOUGHT ON LINE EVEN THOUGH IT WAS INTERNATIONAL I THOUGHT IT WOULD BE COVERED',
 "MY TRANSFER TO BENEFICIARY DIDN'T GO THROUGH",
 "WHY CAN'T I TRANSFER TO A BENEFICIARY",
 'WHY DID I RECEIVE AN ERROR MESSAGE SAYING THAT MY TRANSFER WAS NOT POSSIBLE',
 "I'VE TROD NUMEROUS TIMES

### Note: 
Throughput will be a challenge! The model takes a while to provide all transcriptions. Should consider using a faster model (or pruning) as this will cause latency issues at scale.

# Text Clustering 

In [215]:
def readFile(fileName):
    """
    This function will read the text files passed & return the list
    """
    fileObj = open(fileName, "r") #opens the file in read mode
    words = fileObj.read().splitlines() #puts the file into a list
    fileObj.close()
    return words

data = readFile('data.txt')
#data

In [284]:
data

['WHAT IS THE TRACKING NUMBER FOR MY CART THAT WAS MAILED',
 'HOW DO I TRACK MY CARD',
 "I'M STARTING TO THINK MY CARD IS LOST BECAUSE IT STILL HASN'T ARRIVED CAN YOU HELP",
 "I RECENTLY GOT A NEW PLACE WHILE I'M STAYING ABROAD AND HAVE BEEN USING THIS ACCOUNT TO MANAGE PAYMENTS BUT SUDDENLY I AM SEEING FEES INCREASE WHERE ARE THESE ADDITIONAL FEES COMING FROM",
 'I PURCHASED SOME MAKE UP THROUGH A SIGHT IN CHINA AND I WAS UNDER THE IMPRESSION THAT WHEN I MAY TRANSFERS THERE IS NO FEE WHY AM I SEEING THIS FENOW I AM NOT HAPPY ABOUT THIS AT ALL',
 "I'M NOT PAYING THIS TRANSFER FEE",
 'I THOUGHT TRANSFERS WERE FREE WHY WAS I CHARGE TO FEE',
 'WHAT IS THE TRANSFER F E CHARGE',
 'WHY DID I GET CHARGED FOR SOMETHING I BOUGHT ON LINE EVEN THOUGH IT WAS INTERNATIONAL I THOUGHT IT WOULD BE COVERED',
 "MY TRANSFER TO BENEFICIARY DIDN'T GO THROUGH",
 "WHY CAN'T I TRANSFER TO A BENEFICIARY",
 'WHY DID I RECEIVE AN ERROR MESSAGE SAYING THAT MY TRANSFER WAS NOT POSSIBLE',
 "I'VE TROD NUMEROUS TIMES

## Tokenise and Remove Stopwords

In [246]:
nltk.download('punkt')
nltk.download('stopwords')
import nltk
from nltk.corpus import stopwords

# Get english stop words
en_stopwords = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Theodore\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [254]:
from nltk.tokenize import sent_tokenize, word_tokenize

def tokenise_remove_stopwords(data):
        """ Takes in textual data, converts it to lowercase, tokenises and removes stopwords
    Args:
        list of strings (str): input string
    Returns:
        list: a list of tokens with stopwords removed
    """
    filtered_words = []
    for i in np.arange(0,len(data)):
        words = word_tokenize(data[i].lower())
        for w in words:
            if w not in en_stopwords:
                filtered_words.append(w)
    print(filtered_words)

['tracking', 'number', 'cart', 'mailed', 'track', 'card', "'m", 'starting', 'think', 'card', 'lost', 'still', "n't", 'arrived', 'help', 'recently', 'got', 'new', 'place', "'m", 'staying', 'abroad', 'using', 'account', 'manage', 'payments', 'suddenly', 'seeing', 'fees', 'increase', 'additional', 'fees', 'coming', 'purchased', 'make', 'sight', 'china', 'impression', 'may', 'transfers', 'fee', 'seeing', 'fenow', 'happy', "'m", 'paying', 'transfer', 'fee', 'thought', 'transfers', 'free', 'charge', 'fee', 'transfer', 'f', 'e', 'charge', 'get', 'charged', 'something', 'bought', 'line', 'even', 'though', 'international', 'thought', 'would', 'covered', 'transfer', 'beneficiary', "n't", 'go', 'ca', "n't", 'transfer', 'beneficiary', 'receive', 'error', 'message', 'saying', 'transfer', 'possible', "'ve", 'trod', 'numerous', 'times', 'submit', 'transfer', 'funds', "n't", 'going', 'card', 'arrived', 'yet', 'tried', 'transfer', 'money', 'said', "n't", 'possible', "'ve", 'done', 'worked', "n't", 'wor

## Clustering with BERT embeddings

In [262]:
!pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
     ---------------------------------------- 86.0/86.0 kB 1.2 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting torchvision
  Downloading torchvision-0.13.1-cp39-cp39-win_amd64.whl (1.1 MB)
     ---------------------------------------- 1.1/1.1 MB 1.4 MB/s eta 0:00:00
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp39-cp39-win_amd64.whl (1.1 MB)
     ---------------------------------------- 1.1/1.1 MB 1.4 MB/s eta 0:00:00
Using legacy 'setup.py install' for sentence-transformers, since package 'wheel' is not installed.
Installing collected packages: sentencepiece, torchvision, sentence-transformers
  Running setup.py install for sentence-transformers: started
  Running setup.py install for sentence-transformers: finished with status 'done'
Successfully installed sentence-transformers-2.2.2 sentencepiece-0

In [270]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans

embedder = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')
corpus_embeddings = embedder.encode(data)

num_clusters = 10
clustering_model = KMeans(n_clusters=num_clusters) # Define kmeans model
clustering_model.fit(corpus_embeddings) # Fit the embedding with kmeans clustering.
cluster_assignment = clustering_model.labels_ # Get the cluster id assigned to each news headline.

clustered_sentences = [[] for i in range(num_clusters)]
for sentence_id, cluster_id in enumerate(cluster_assignment):
    clustered_sentences[cluster_id].append(data[sentence_id])
for i, cluster in enumerate(clustered_sentences):
    print("Cluster ", i+1)
    print(cluster)
    print("")

Cluster  1
['WHEN WILL I RECEIVE A YU S TRANSFER', 'HOW LONG DOES IT TAKE FOR A TRANSFER', 'WHAT IS THE NUMBER OF DAYS I HAVE TO WAIT FOR MY EUROPE TRANSFER', 'IF I STARTED THE BANK TRANSFER FROM EUROPE HOW LONG WILL THE PROCESS TAKE TO COMPLETE', 'HOW LONG IS THE WAIT FOR A UAS TRANSFER', 'IS IT NORMAL TO HAVE TO WAIT OVER A WEEK FOR MY NEW CART', "I STILL DON'T HAVE MY CART AFTER TWO WEEKS WHAT SHOULD I DO"]

Cluster  2
['HOW LONG UNTIL THE MONEY IS IN MY ACCOUNT', 'WHEN SHOULD I EXPECT TO SEE MY TRANSFER HIT MY ACCOUNT', 'I HAVE TO VERIFY THE SOURCE OF MY FUNDS', 'CAN I VERIFY THE SOURCE OF MY FUNDS', 'A TRANSACTION IS REPEATED SEVERAL TIMES ON MY ACCOUNT', 'THERE IS MORE THAN ONE OF THE SAME TRANSACTION ON MY ACCOUNT', 'CAN MY FRIENDS SEND ME MONEY', 'WHAT DIFFERENT WAYS ARE THERE FOR SOME ONE TO SEND ME MONEY', 'HOW DO I DEPOSIT MY PAY CHECK TO THIS ACCOUNT', 'HOW CAN SOME ONE SEND ME MONEY', 'HOW DO I GET MY SALARY IN THE ACCOUNT', 'CAN I GET MY PAYCHECK THROUGH HERE']

Cluster  

In [None]:
print(cluster_assignment)

## Clustering with all-MiniLM embeddings

In [272]:
embedder_1 = SentenceTransformer('all-MiniLM-L6-v2')
corpus_embeddings_1 = embedder_1.encode(data)

# Perform K-Means Clustering
num_clusters = 6
clustering_model_1 = KMeans(n_clusters=num_clusters)
clustering_model_1.fit(corpus_embeddings_1) # Fit the embedding with kmeans clustering.
cluster_assignment_1 = clustering_model_1.labels_ # Get the cluster id assigned to each news headline.

clustered_sentences_1 = [[] for i in range(num_clusters)]

for sentence_id, cluster_id in enumerate(cluster_assignment_1):
    clustered_sentences_1[cluster_id].append(data[sentence_id])
for i, cluster in enumerate(clustered_sentences_1):
    print("Cluster ", i+1)
    print(cluster)
    print("")

Cluster  1
['WHY DID I GET CHARGED FOR SOMETHING I BOUGHT ON LINE EVEN THOUGH IT WAS INTERNATIONAL I THOUGHT IT WOULD BE COVERED', 'WHY WAS MY PAYMENT REVERSED', 'A TRANSACTION IS REPEATED SEVERAL TIMES ON MY ACCOUNT', 'CAN YOU PLEASE CHECK IF I WAS CHARGED TWICE', 'PLEASE CHECK PAYMENTS ON MY CART THERE IS A DUPLICATE AND I ONLY BOUGHT IT ONCE', 'I GOT DOUBLE CHARGED FOR A PAYMENT SO HOW DO I FIX THAT', 'I WAS WONDERING HOW I COULD HAVE TWO CHARGES FOR THE SAME ITEM HAPPEN MORE THAN ONCE IN A SEVEN DAY PERIOD IS THERE ANYWAY I COULD GET THIS CORRECTED A S A P', 'I SEE WHAT LOOKS LIKE DUPLICATE CHARGES ON ACCOUNT', 'IT APPEARS THAT I AM BEING DOUBLE CHARGED FOR SOME ITEMS THAT I HAVE PURCHASED THIS PAST WEEK PLEASE REVIEW AND CORRECT', 'WHY AM I BEING CHARGED TWICE', 'I WOULD LIKE TO KNOW WHY I WAS CHARGED TWICE FOR MY PURCHASE', "I'VE BEEN CHARGED MORE THAN ONCE FOR THE SAME TRANSACTION", 'THERE IS MORE THAN ONE OF THE SAME TRANSACTION ON MY ACCOUNT', "I DIDN'T BUY THIS TWICE", 'IT LO

In [287]:
print(clustered_sentences_1[0][0])

WHY DID I GET CHARGED FOR SOMETHING I BOUGHT ON LINE EVEN THOUGH IT WAS INTERNATIONAL I THOUGHT IT WOULD BE COVERED


In [292]:
cluster_assignment_1[0]

3

## Flask API

In [283]:
from flask import Flask, request, redirect, url_for, flash, jsonify
import pickle as p
import json
import requests


app = Flask(__name__)


@app.route('/api/', methods=['POST'])
def makecalc():
    data = request.get_json()
    prediction = np.array2string(model.predict(data))

    return jsonify(prediction)

if __name__ == '__main__':
    modelfile = 'models/final_prediction.pickle'
    model = p.load(open(modelfile, 'rb'))
    app.run(debug=True, host='0.0.0.0')
    
url = 'http://0.0.0.0:4000/api/'

data = [[14.34, 1.68, 2.7, 25.0, 98.0, 2.8, 1.31, 0.53, 2.7, 13.0, 0.57, 1.96, 660.0]]
j_data = json.dumps(data)
headers = {'content-type': 'application/json', 'Accept-Charset': 'UTF-8'}
r = requests.post(url, data=j_data, headers=headers)
print(r, r.text)

FileNotFoundError: [Errno 2] No such file or directory: 'models/final_prediction.pickle'