In [5]:
import numpy
import urllib.request
import scipy.optimize
import random
from collections import defaultdict
import nltk
import string
from nltk.stem.porter import *
from sklearn import linear_model
from nltk.corpus import stopwords

class KeywordCount:
    """Extract keyword and question-mark features from one transcript file.

    Typical use: construct with a file name, then call updateWordCount(),
    updateKeywordStatistics() and questionMark() in that order. Each method
    stores its results on the instance and recomputes them from scratch, so
    calling a method again does not double-count.

    NOTE(review): updateWordCount() stores Porter-stemmed tokens, while
    updateKeywordStatistics() looks single-word keywords up verbatim. A
    keyword whose stem differs from its surface form can therefore never
    match, and the stop-word test is applied to the stem rather than to the
    original token. Left unchanged because fixing it alters feature values;
    confirm the intended matching rule with the feature consumers.
    """

    def __init__(self, fname):
        """Read and tokenize the transcript, and build the keyword dictionary.

        fname: path of the input transcript. It is read as UTF-8 and
        undecodable bytes are ignored.
        """
        self.name = fname  #file name of input transcript
        #init for updateWordCount()
        self.words = []  #sequenced word list (stemmed, stop words included)
        self.wordCount = defaultdict(int)  #dictionary of non-stop words and their count
        self.wordCountSort = []  #(word, count) pairs sorted by ascending count
        self.transSize = 0  #words the transcript have
        self.transSizeNS = 0  #meaningful words the transcript have
        self.wordsetSize = 0  #all different words
        #init for updateKeywordStatistics()
        self.keywordStat = defaultdict(dict)  #num(int) certain keyword appears in the transcript
        self.keywordLoc = defaultdict(dict)  #locations(list) certain keyword appears in the transcript
        self.keywordPart = defaultdict(dict)  #percentage per keyword from all keywords
        self.keywordPer = float(0)  #percentage all keywords from all words
        #init for questionMark()
        self.questionStat = 0
        self.questionLoc = []
        self.questionPer = float(0)

        print("Reading data...")
        print("Parsing by words...")
        # Read the file once; the same lines feed both the word tokenizer
        # (joined back into the full text) and the sentence tokenizer.
        with open(fname, encoding='utf-8', errors='ignore') as f:
            lines = f.readlines()
        tokenizer = nltk.tokenize.TreebankWordTokenizer()
        self.data = tokenizer.tokenize(''.join(lines))
        print("Parsing by sentences...")
        self.sents = []  #lower-cased sentences, tokenized line by line
        for line in lines:
            self.sents.extend(nltk.tokenize.sent_tokenize(line.lower()))
        self.sentNum = len(self.sents)

        print("done")

        print("Initializing keword dictionary...")
        # Dictionary of key-terms for CTS fidelity: category -> set of
        # lower-case keywords or multi-word phrases.
        self.keywordCTS = defaultdict(set)

        self.keywordCTS['Agenda'] = {
            'agenda', 'priorities', 'most important', 'focus on first',
            'talk about today', 'work on today', 'focus on during the session',
            'work on', 'you like to add to the agenda',
            'you like to add anything to the agenda',
            'you want to add to the agenda',
            'you want to add anything to the agenda', 'last week',
        }

        self.keywordCTS['Feedback'] = {
            'feedback', 'previous', 'last time', 'last week', 'last session',
            'past session', 'think about today', 'things go today',
            'think about today\'s session', 'concern', 'unhelpful', 'helpful',
            'anything i can do better', 'anything we can do better',
            'concerns about today\'s session', 'helpful about the session',
            'learn', 'skills', 'achieve', 'goals',
            'if i understand you correctly', 'are you saying',
            'do i have it right',
        }

        self.keywordCTS['Understanding'] = {
            'understand', 'understanding', 'sounds like', 'you are saying',
            'you are feeling', 'you were feeling', 'you felt', 'see',
            'makes sense', 'i see', 'feel that way', 'feel this way',
        }

        # NOTE(review): 'dissappointing' looks like a misspelling of
        # 'disappointing'; kept verbatim because it is an output key.
        self.keywordCTS['Interpersonal Effectiveness'] = {
            'sorry', 'hard', 'difficult', 'tough',
            'dissappointing', 'stressful', 'stressed',
            'scary', 'frightening', 'upset', 'upsetting',
            'unfortunate',
        }

        self.keywordCTS['Collaboration'] = {
            'choice', 'you want to do', 'good idea', 'because', 'will',
            'help you get your goal',
        }

        self.keywordCTS['Guided Discovery'] = {
            'meaning', 'mean', 'self', 'how', 'why', 'evidence',
            'conclusion', 'conclude', 'decide', 'decision', 'decided',
            'know', 'proof', 'tell me more', 'assume', 'assumption',
            'hypothesis', 'disprove', 'facts', 'fact', 'solutions',
            'brainstorm', 'solve', 'alternative', 'other explanations',
            'another way', 'other way', 'to think about', 'to explain',
            'reason',
        }

        self.keywordCTS['Focus on Key Cognitions'] = {
            'thinking', 'tell yourself', 'through your mind',
            'thought', 'think', 'connection', 'lead to', 'connected',
            'connect', 'link', 'linked', 'make you', 'you do',
        }

        # No keywords defined yet; an empty set (not `{}`, which is a dict)
        # keeps every category value the same type.
        self.keywordCTS['Choices of Intervention'] = set()

        self.keywordCTS['Homework'] = {
            'homework', 'review', 'at home', 'practice', 'assignment',
            'assign', 'assigned', 'progress', 'learned', 'improve', 'learn',
            'skills', 'goal', 'better', 'barrier', 'in the way', 'expect',
            'problems', 'succeed', 'success',
        }

        self.keywordCTS['Social Skills Training'] = {
            'rational', 'help you learn this skill', 'help you with your goal',
            'demonstrate', 'to make your next role', 'play better',
            'play even better', 'try to focus on',
        }

    def updateWordCount(self):
        """Count the stemmed words of the transcript.

        Fills self.words, self.wordCount, self.wordCountSort, self.transSize,
        self.transSizeNS and self.wordsetSize from self.data.
        """
        print('Starting counting words...')
        stopWords = set(stopwords.words("english"))
        # Ignore capitalization and remove punctuation
        punctuation = set(string.punctuation)
        stemmer = PorterStemmer()

        # Start from a clean slate so a repeated call does not double-count.
        self.words = []
        self.wordCount = defaultdict(int)
        self.transSize = 0
        self.transSizeNS = 0

        for token in self.data:
            stripped = ''.join(c for c in token if c not in punctuation)
            if not stripped:
                # Token consisted of punctuation only.
                continue
            w = stemmer.stem(stripped.lower())
            self.words.append(w)
            self.transSize += 1
            # The stop-word test is applied to the stemmed form.
            if w not in stopWords:
                self.wordCount[w] += 1
                self.transSizeNS += 1
        self.wordsetSize = len(self.wordCount)
        self.wordCountSort = sorted(self.wordCount.items(), key=lambda kv: kv[1])
        print('Word counting done.')

    def updateKeywordStatistics(self):
        """Compute per-keyword counts, relative locations and shares.

        Single-word keywords are looked up in self.wordCount and located in
        self.words (location = word index / number of words). Multi-word
        keywords are counted as substrings of each entry of self.sents
        (location = sentence index / number of sentences, repeated once per
        occurrence). Results go to self.keywordStat, self.keywordLoc,
        self.keywordPart and self.keywordPer; the ratios are 0.0 when their
        denominator is zero.
        """
        print('Starting keyword statisics...')
        self.keywordStat = defaultdict(dict)
        self.keywordLoc = defaultdict(dict)
        self.keywordPart = defaultdict(dict)

        # Index every word's relative positions once instead of rescanning
        # the whole word list for each keyword.
        total_words = len(self.words)
        positions = defaultdict(list)
        for index, word in enumerate(self.words):
            positions[word].append(float(index) / float(total_words))

        num_sents = len(self.sents)
        total = 0  # occurrences summed over every (category, keyword) pair
        for category, keywords in self.keywordCTS.items():
            for keyword in keywords:
                self.keywordStat[category][keyword] = 0
                if len(keyword.split()) == 1:
                    if keyword in self.wordCount:
                        count = self.wordCount[keyword]
                        self.keywordStat[category][keyword] = count
                        total += count
                        # Copy so categories sharing a keyword do not share
                        # one list object.
                        self.keywordLoc[category][keyword] = list(positions.get(keyword, []))
                else:
                    loc = []
                    for index, sentence in enumerate(self.sents):
                        occurrences = sentence.count(keyword)
                        if occurrences:
                            self.keywordStat[category][keyword] += occurrences
                            total += occurrences
                            loc.extend([float(index) / float(num_sents)] * occurrences)
                    if self.keywordStat[category][keyword] > 0:
                        self.keywordLoc[category][keyword] = loc

        for category, stats in self.keywordStat.items():
            for keyword, count in stats.items():
                # Guard: no keyword matched anywhere in the transcript.
                self.keywordPart[category][keyword] = count / float(total) if total else 0.0
        # Guard: empty transcript, or updateWordCount() has not been run.
        self.keywordPer = total / float(self.transSize) if self.transSize else 0.0
        print('Keyword statistics done.')

    def questionMark(self):
        """Count '?' tokens and record their relative positions.

        Sets self.questionStat (count), self.questionLoc (token index /
        number of tokens for each '?') and self.questionPer (count divided by
        self.sentNum, or 0.0 when there are no sentences).
        """
        print('Starting questionmark statistics...')
        total_tokens = len(self.data)
        # Recompute from scratch so repeated calls do not accumulate.
        self.questionLoc = [
            float(index) / float(total_tokens)
            for index, token in enumerate(self.data)
            if token == '?'
        ]
        self.questionStat = len(self.questionLoc)
        if self.sentNum:
            self.questionPer = float(self.questionStat) / float(self.sentNum)
        else:
            self.questionPer = float(0)
        print('Questionmark statistics done.')


In [12]:
# Build every feature for one transcript and dump them to 'Feature<name>'.
t1 = KeywordCount('Transcript1.txt')
t1.updateWordCount()
t1.updateKeywordStatistics()
t1.questionMark()

# Horizontal rule written between report sections.
separator = '========================================================================================' + '\n'

# (heading, value) pairs, in the order they appear in the report.
features = [
    ('Feature 1: times(int) certain keyword appears in the transcript ', t1.keywordStat),
    ('Feature 2: locations(list of floats) certain keyword appears in the transcrip', t1.keywordLoc),
    ('Feature 3: percentage per keyword from all keywords', t1.keywordPart),
    ('Feature 4: percentage of all keywords from all words', t1.keywordPer),
    ('Feature 5: questionmarks count', t1.questionStat),
    ('Feature 6: questionmarks location', t1.questionLoc),
    ('Feature 7: percentage of sentences ending with questionmark from all sentences', t1.questionPer),
]

# The context manager closes the file even if a write raises part-way.
with open('Feature' + t1.name, 'w') as f:
    f.write('File name: ' + t1.name + '\n')
    f.write(separator)
    f.write('Object: ' + 'Overall' + '\n')
    f.write(separator)
    f.write('Keyword Dictionary:' + '\n')
    f.write(str(t1.keywordCTS))
    f.write('\n')
    # A triple rule sets the dictionary dump apart from the features; the
    # third rule is the one written ahead of the first feature below.
    f.write(separator)
    f.write(separator)
    for heading, value in features:
        f.write(separator)
        f.write(heading + '\n')
        f.write(str(value))
        f.write('\n')

Reading data...
Parsing by words...
Parsing by sentences...
done
Initializing keword dictionary...
Starting counting words...
Word counting done.
Starting keyword statisics...
Keyword statistics done.
Starting questionmark statistics...
Questionmark statistics done.
