## Load essentials

In [2]:
# Data packages
import pandas as pd
import numpy as np
import mlflow

# Text processing (NLTK) and text vectorizers (scikit-learn)
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize, sent_tokenize 
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
# Fetch the NLTK resources requested by name.
nltk.download('stopwords')
nltk.download('punkt')
from nltk.stem import WordNetLemmatizer
# NOTE(review): the lemmatizer is created before the 'wordnet' download on the next line.
lemmatizer = WordNetLemmatizer()
nltk.download('wordnet')

# Model-building libraries
# NOTE(review): pandas is already imported at the top of this cell; this second import is redundant.
import pandas as pd
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier

In [3]:
%run /Meetings/helpers/Test_Model

In [4]:
%run /Meetings/helpers/Data_Creation

## Random Forest

### Document-term matrix (DTM)

In [7]:
def create_dtm(dataframe, columnname):
  """Fit a TF-IDF vectorizer on one text column and return it with the dense matrix.

  Args:
    dataframe: pandas DataFrame holding the documents, one per row.
    columnname: Label of the column in ``dataframe`` that contains the text.

  Returns:
    Tuple ``(vec, dtm)``: the fitted ``TfidfVectorizer`` and a DataFrame with
    one row per document and one column per vocabulary term. The DataFrame
    gets a fresh default index rather than ``dataframe``'s index.
  """
  vec = TfidfVectorizer()
  X = vec.fit_transform(dataframe[columnname])
  # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
  # only fall back to it on releases that predate get_feature_names_out().
  if hasattr(vec, "get_feature_names_out"):
    terms = vec.get_feature_names_out()
  else:
    terms = vec.get_feature_names()
  # toarray() densifies the sparse matrix, so memory grows with
  # (number of documents) x (vocabulary size).
  dtm = pd.DataFrame(X.toarray(), columns=terms)
  return vec, dtm

### Classifier

In [9]:
def splitdata_rf(dtm, labeldf, labelcolumn, test_size=0.20, random_state=42):
  """Split features and one label column into training and testing sets.

  Args:
    dtm: Feature matrix with one row per sample.
    labeldf: DataFrame containing the target column; it must have as many
      rows as ``dtm``.
    labelcolumn: Integer position of the target column in ``labeldf``.
    test_size: Share of the samples held out for testing. Defaults to 0.20,
      the value that used to be hard-coded.
    random_state: Seed for the shuffle, so the split is reproducible.
      Defaults to 42, the value that used to be hard-coded.

  Returns:
    Tuple ``(train_features, test_features, train_labels, test_labels)``.
  """
  # Use scikit-learn to split the data into training and testing sets.
  labels = labeldf.iloc[:, labelcolumn]
  train_features, test_features, train_labels, test_labels = train_test_split(
      dtm, labels, test_size=test_size, random_state=random_state)
  return train_features, test_features, train_labels, test_labels

In [10]:
def randomforest(dtm, labeldf, labelcolumn, n_estimators=10, random_state=0, verbose=3):
  """Train a random-forest classifier on a train/test split and report on the test part.

  Args:
    dtm: Feature matrix with one row per sample.
    labeldf: DataFrame containing the target column.
    labelcolumn: Integer position of the target column in ``labeldf``.
    n_estimators: Number of trees. Defaults to 10, the value that used to be
      hard-coded.
    random_state: Seed passed to the classifier. Defaults to 0.
    verbose: Verbosity level passed to the classifier. Defaults to 3.

  Returns:
    Tuple ``(model, y_pred, report)``: the fitted classifier, its predictions
    for the test partition, and the text classification report restricted to
    labels 0 and 1. The report is also printed.
  """
  # Function-scope import kept from the original so the report does not depend
  # on the notebook-level name `metrics`.
  # NOTE(review): presumably guards against the %run helper notebooks
  # rebinding that name; confirm, and hoist to the import cell if not needed.
  from sklearn import metrics

  train_features, test_features, train_labels, test_labels = splitdata_rf(dtm, labeldf, labelcolumn)

  # The model gets its own local name; the original reused `randomforest`,
  # shadowing this function inside its own body.
  classifier = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state, verbose=verbose)
  classifier.fit(train_features, train_labels)

  # Predict labels for the held-out test set.
  y_pred = classifier.predict(test_features)

  report = metrics.classification_report(test_labels, y_pred, labels=[0,1])
  print(report)
  return classifier, y_pred, report

In [11]:
def create_baseline(dataframe, textcolumnname, labelcolumn):
  """Vectorize one text column of ``dataframe`` and train the baseline forest on it.

  Args:
    dataframe: DataFrame holding both the text and the label column.
    textcolumnname: Label of the text column, forwarded to ``create_dtm``.
    labelcolumn: Integer position of the label column, forwarded to
      ``randomforest``.

  Returns:
    Tuple ``(vec, rfmodel, predictions, report)``: the vectorizer from
    ``create_dtm`` followed by the three values returned by ``randomforest``.
  """
  vectorizer, features = create_dtm(dataframe, textcolumnname)
  model, test_predictions, report_text = randomforest(features, dataframe, labelcolumn)
  return vectorizer, model, test_predictions, report_text

In [12]:
def predict_rf(rfmodel, dataframe, textcolumn, vec):
  """Return class-probability predictions for the text held in one column.

  Args:
    rfmodel: Fitted classifier exposing ``predict_proba``.
    dataframe: DataFrame holding the text to score, one document per row.
    textcolumn: Column to read. An integer is treated as a column position
      (``iloc``), as before; a string is treated as a column label, which is
      how ``create_dtm`` addresses its text column.
    vec: Fitted vectorizer exposing ``transform``.

  Returns:
    The value returned by ``rfmodel.predict_proba`` for the vectorized text.
  """
  if isinstance(textcolumn, str):
    texts = dataframe[textcolumn]
  else:
    texts = dataframe.iloc[:, textcolumn]
  featurevector = vec.transform(texts)
  return rfmodel.predict_proba(featurevector)

In [13]:
def predictions_df_rf(meeting, t, ground_truth, df):
  """Return ``df`` extended by one row holding a meeting's predicted and reference summary.

  Args:
    meeting: Zero-based meeting index; stored as ``meeting + 1`` in
      ``'Summary_ID'``.
    t: Predicted summary text, stored in ``'Prediction'``.
    ground_truth: Reference summary, stored in ``'Ground Truth'``.
    df: DataFrame of the rows collected so far. It is not modified.

  Returns:
    A new DataFrame with the rows of ``df`` followed by the new row, indexed
    from 0.
  """
  row = pd.DataFrame([{'Summary_ID': meeting + 1, 'Prediction': t, 'Ground Truth': ground_truth}])
  if len(df) == 0:
    # No existing rows to keep: return the new row laid out on df's columns
    # (plus any column only the row has). This avoids concatenating with an
    # empty frame, which pandas 2.x warns about.
    columns = list(df.columns) + [c for c in row.columns if c not in df.columns]
    return row.reindex(columns=columns)
  # DataFrame.append was removed in pandas 2.0; concat is its replacement.
  return pd.concat([df, row], ignore_index=True)

In [14]:
def create_predictions_rf(meeting, dataframe, rfmodel, vec, dftest, summary_table, threshold):
  """Build the predicted summary for one meeting and add it to ``dataframe``.

  Args:
    meeting: Zero-based meeting index. Rows of ``dftest`` whose
      ``'Meeting ID'`` equals ``meeting + 1`` are scored, and row ``meeting``
      of the summary table supplies the reference summary.
    dataframe: Results collected so far; passed on to ``predictions_df_rf``.
    rfmodel: Fitted classifier, passed to ``predict_rf``.
    vec: Fitted vectorizer, passed to ``predict_rf``.
    dftest: Test sentences for all meetings; must have ``'Meeting ID'`` and
      ``'Transcript'`` columns. It is not modified.
    summary_table: Passed to ``table_to_df`` to load the reference summaries.
    threshold: A sentence is kept when its second predicted probability
      (index 1 of its prediction row) is strictly greater than this value.

  Returns:
    The DataFrame returned by ``predictions_df_rf`` for this meeting.
  """
  # Work on a copy of the filtered rows so the column writes below do not
  # target a slice of the caller's frame (SettingWithCopyWarning).
  meeting_rows = dftest[dftest['Meeting ID'] == meeting + 1].copy()
  # NOTE(review): column 3 is cast to str here, but column 2 is the one that
  # is vectorized below; confirm both positions are intended.
  meeting_rows.iloc[:,3] = meeting_rows.iloc[:,3].astype(str)

  predictions = predict_rf(rfmodel, meeting_rows, 2, vec)

  # Label a sentence 1 when its index-1 probability exceeds the threshold.
  labels = [1 if row[1] > threshold else 0 for row in predictions]
  meeting_rows['Predicted'] = labels
  selected = meeting_rows.Transcript[meeting_rows.Predicted == 1]

  # Concatenate the selected sentences; each one is followed by a space, so a
  # non-empty summary keeps the trailing space the original loop produced.
  t = "".join(str(sentence) + " " for sentence in selected)

  # Grab the original summary.
  ami_df = table_to_df(summary_table)
  ground_truth = ami_df.iloc[meeting,3]

  # Uses predictions_df_rf from this notebook; the original called
  # predictions_df, which is not defined in this file.
  return predictions_df_rf(meeting, t, ground_truth, dataframe)


In [15]:
def test_model_rf(rfmodel, dftest, reload, summary_table, start, stop, threshold, vec):
  if reload == True:
    dftest = load_test(summary_table, start, stop)
  
  df_prediction = pd.DataFrame(columns=['Summary_ID', 'Prediction', 'Ground Truth'])

  for i in range(start,stop):
    df_prediction = create_predictions_rf(i, df_prediction, rfmodel, vec, dftest, summary_table, threshold)
  
  hypotheses = df_prediction.iloc[:,1].values
  ground_truth = df_prediction.iloc[:,2].values
  
  log_array = calculate_rouge(hypotheses, ground_truth)
  
  return dftest, df_prediction, log_array