In [1]:
#Copied from fastai notebook
%reload_ext autoreload
%autoreload 2
%matplotlib inline

#Importing data packages
import pandas as pd
import numpy as np

from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize, sent_tokenize 
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

##Data Creation

In [3]:
def table_to_df(table_name):
  """Load every row of a Spark SQL table into a pandas DataFrame.

  Args:
    table_name: Name of the table to read. It is concatenated directly into
      the SQL text, so only pass trusted identifiers.

  Returns:
    The result of ``toPandas()`` on the query result.

  Note:
    Relies on a ``spark`` session object being present in the notebook's
    global namespace; it is not created in this function.
  """
  query = "select * from " + table_name
  return spark.sql(query).toPandas()

In [4]:
def prep_df(df):
  """Cast the text and label columns of a sentence dataframe to usable dtypes.

  The column at position 2 is cast to ``str``. The column at position 3 is
  cast to ``str`` and finally to ``int``; in between, ``'neg'`` / ``'pos'``
  entries of the ``Label`` column are replaced by ``0`` / ``1``. The function
  therefore expects ``Label`` to be the column at position 3.

  Columns are reassigned by label rather than through ``df.iloc[:, i] = ...``
  because positional assignment writes into the existing column and can leave
  it with ``object`` dtype even after ``astype(int)``.

  Args:
    df: DataFrame with at least four columns, one of them named ``Label``.

  Returns:
    The same ``df`` object, modified in place.

  Raises:
    ValueError: If a label is neither ``'neg'``, ``'pos'`` nor convertible
      to ``int``.
  """
  label_codes = {'neg': 0, 'pos': 1}
  text_col = df.columns[2]
  label_col = df.columns[3]

  df[text_col] = df[text_col].astype(str)
  df[label_col] = df[label_col].astype(str)
  # Values other than 'neg'/'pos' (e.g. '0'/'1' from an earlier run) pass
  # through untouched and are handled by the int cast below.
  df['Label'] = df['Label'].map(lambda value: label_codes.get(value, value))
  df[label_col] = df[label_col].astype(int)
  return df

In [5]:
#Select transcript and corresponding summary of one meeting, split by sentence and make list
def isolate(row, df): 
  """Split one row's transcript and summary text into sentence lists.

  Args:
    row: Positional row index into ``df``.
    df: DataFrame whose column at position 1 is read as the transcript text
      and whose column at position 3 is read as the summary text.

  Returns:
    A ``(transcript, summary)`` tuple; each element is the list produced by
    ``sent_tokenize`` for the corresponding cell.
  """
  from nltk.tokenize import sent_tokenize

  text_columns = (1, 3)  # transcript first, then summary
  return tuple(sent_tokenize(df.iloc[row, column]) for column in text_columns)

In [6]:
#Compare each transcript sentence with all summary sentences to find match
def Assign_Label(transcript, summary):
  """Label a transcript sentence by whether it appears verbatim in the summary.

  Args:
    transcript: A single transcript sentence.
    summary: Sequence of summary sentences to compare against.

  Returns:
    ``'pos'`` if ``transcript`` equals at least one element of ``summary``,
    otherwise ``'neg'`` (also for an empty summary).
  """
  is_in_summary = any(transcript == candidate for candidate in summary)
  return 'pos' if is_in_summary else 'neg'

In [7]:
#Preprocess one sentence string: strip digits/punctuation, lowercase, tokenize, drop stop words, lemmatize
# Module-level NLP resources read by BOW() below.
# (nltk, stopwords and WordNetLemmatizer are also imported in the first cell.)
import nltk 
# Tokenizer instance; BOW() calls WPT.tokenize() on the cleaned sentence.
WPT = nltk.WordPunctTokenizer()
# One-time NLTK data downloads -- uncomment on an environment where the
# corpora are not installed yet.
#nltk.download('stopwords')
#nltk.download('punkt')
#nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Rebinds the `lemmatizer` name already created in the first cell.
lemmatizer = WordNetLemmatizer()
# English stop words; BOW() drops every token found in this list.
stop_word_list = nltk.corpus.stopwords.words('english')

def BOW(list):
    '''Clean, tokenize, stop-word-filter and lemmatize a single sentence.

    Steps, in order: remove digits, remove a fixed set of punctuation
    characters, remove single-letter ASCII words, collapse whitespace,
    lowercase, tokenize with ``WPT``, drop tokens found in
    ``stop_word_list`` and lemmatize the rest with ``lemmatizer``.

    Args:
        list: The sentence to process, as a string. (The name shadows the
            builtin; it is kept so existing keyword callers keep working.)

    Returns:
        The surviving lemmatized tokens joined by single spaces; an empty
        string if nothing survives.
    '''
    import re

    text = list

    # Applied top to bottom; every pattern is a raw string so the regex
    # escapes are not treated as (invalid) Python string escapes.
    substitutions = (
        (r" \d+", " "),                       # space-prefixed digit runs
        (r"[0-9]+", ""),                      # any remaining ASCII digits
        (r"^\d+\s|\s\d+\s|\s\d+$", " "),      # stand-alone (non-ASCII) digit runs
        (r"[-_)(;:$%#.]", ""),                # special characters
        (r"[/']", ""),                        # slashes and apostrophes
        (r"\b[a-zA-Z]\b", ""),                # single-letter words
        (r"\s+", " "),                        # collapse whitespace runs
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    text = text.strip().lower()

    tokens = WPT.tokenize(text)
    kept = [token for token in tokens if token not in stop_word_list]
    return ' '.join(lemmatizer.lemmatize(token) for token in kept)

In [8]:
#Calculate TF-IDF per unique word in one transcript and calculate the sum per sentence (TF-IDF on sentence level score)
def Assign_Score(tokenized):
  """Score each sentence by the sum of its TF-IDF term weights.

  A fresh ``TfidfVectorizer`` is fitted on the given sentences only, so the
  weights are relative to this one collection of sentences.

  Args:
    tokenized: Iterable of preprocessed sentence strings.

  Returns:
    A NumPy array of shape ``(n_sentences, 1)`` holding one summed TF-IDF
    score per sentence. For an empty input a ``(0, 1)`` array is returned
    instead of letting the vectorizer fail.

  Raises:
    ValueError: Propagated from ``TfidfVectorizer`` when the sentences are
      non-empty but yield no vocabulary at all.
  """
  import numpy as np

  documents = list(tokenized)
  if not documents:
    # Nothing to vectorize; keep the (n, 1) shape callers index as score[i][0].
    return np.zeros((0, 1))

  from sklearn.feature_extraction.text import TfidfVectorizer

  tfidf = TfidfVectorizer().fit_transform(documents)
  # Summing the sparse matrix along axis 1 gives a (n, 1) matrix; convert it
  # to a plain ndarray.
  return np.asarray(tfidf.sum(axis=1))

In [9]:
#Calculate length of tokens in tokenized sentence
def Calculate_Len(sentence):
  """Return how many tokens ``word_tokenize`` produces for ``sentence``.

  Args:
    sentence: The sentence string to measure.

  Returns:
    The token count as an ``int``.
  """
  return len(word_tokenize(sentence))


In [10]:
#Add all collected values for 1 transcript to dataframe 
def Append_DF (df, m_id, transcript, label, score):
  """Append one meeting's sentences, with min-max scaled features, to ``df``.

  For every sentence a row with the columns ``Meeting ID``, ``Transcript``,
  ``Tokenized``, ``Label``, ``Length`` and ``Score`` is built. ``Length`` and
  ``Score`` are min-max scaled over the rows of this meeting only and the
  rows are then appended to ``df``. Rows already present in ``df`` are left
  untouched, so repeated calls do not rescale earlier meetings.

  Args:
    df: DataFrame collecting the rows of all meetings loaded so far.
    m_id: Identifier stored in the ``Meeting ID`` column.
    transcript: List of raw transcript sentences.
    label: Per-sentence labels, aligned with ``transcript``.
    score: Per-sentence scores indexed as ``score[i][0]``.

  Returns:
    A new DataFrame containing the rows of ``df`` followed by the rows of
    this meeting, or ``df`` itself when ``transcript`` is empty.
  """
  print('Loading values for meeting ' + str(m_id))
  n_sentences = len(transcript)
  if n_sentences == 0:
    # Nothing to add, and MinMaxScaler cannot be fitted on zero rows.
    return df

  import pandas as pd
  from sklearn import preprocessing

  # Tokenize the transcript once, not once per sentence.
  tokenized = [BOW(item) for item in transcript]
  halfway = round((1/2 * n_sentences))

  rows = []
  for i in range(n_sentences):
    rows.append({
      'Meeting ID': m_id,
      'Transcript': transcript[i],
      'Tokenized': tokenized[i],
      'Label': label[i],
      'Length': Calculate_Len(tokenized[i]),
      'Score': score[i][0],
    })

    # Print progress
    if i == halfway:
      print('Loaded 1/2 of meeting ' + str(m_id))
    if i == n_sentences - 1:
      print('Loaded 2/2 of meeting ' + str(m_id))

  # Build the frame in one go; DataFrame.append no longer exists in pandas 2.
  meeting_df = pd.DataFrame(rows)
  for column in ('Length', 'Score'):
    raw = meeting_df[[column]].values.astype(float)
    meeting_df[column] = preprocessing.MinMaxScaler().fit_transform(raw)[:, 0]

  if len(df) == 0:
    # Skip concatenating with an empty frame (it would drag object dtypes
    # along) but keep any column order the caller set up.
    columns = [name for name in df.columns]
    columns += [name for name in meeting_df.columns if name not in df.columns]
    return meeting_df.reindex(columns=columns)
  return pd.concat([df, meeting_df], ignore_index=True)

In [11]:
#Loop over all transcripts in the given row range, collect the per-sentence values and return the resulting dataframe
def load_sentences(df_original, df_new, start, stop):
  """Build the sentence-level dataframe for a range of meetings.

  For every row index in ``range(start, stop)`` the transcript and summary
  are split into sentences (``isolate``), each transcript sentence is
  preprocessed (``BOW``), labelled against the summary (``Assign_Label``) and
  scored (``Assign_Score``); the results are appended with ``Append_DF``
  under meeting id ``row + 1``.

  Args:
    df_original: DataFrame holding one meeting per row, as expected by
      ``isolate``.
    df_new: DataFrame the sentence rows are appended to.
    start: First row index to process (inclusive).
    stop: Row index to stop at (exclusive).

  Returns:
    The dataframe returned by the last ``Append_DF`` call, or ``df_new``
    unchanged when the range is empty.
  """
  for row in range(start, stop):
    transcript, summary = isolate(row, df_original)
    tokenized = [BOW(sentence) for sentence in transcript]
    labels = [Assign_Label(sentence, summary) for sentence in transcript]
    scores = Assign_Score(tokenized)
    df_new = Append_DF(df_new, row + 1, transcript, labels, scores)

  # Report how many meetings were processed and the resulting size instead
  # of the `stop` index and a dump of the whole dataframe.
  n_meetings = max(stop - start, 0)
  print("Completed loading " + str(n_meetings) + " meetings; dataframe now has " + str(len(df_new)) + " rows")
  return df_new