# Clean Dataset
This notebook takes all the downloaded data from Coursera and extracts just the course transcripts.

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re

## Input Course URL
The course id can be found in the hyperlink for any page in the course.

In [2]:
# Any page URL from the course works: the course id is the path segment that
# directly follows "coursera.org/learn/".
url = 'https://www.coursera.org/learn/siads697698/lecture/3vwIb/how-to-do-a-standup'

# Raw string so that `\w` reaches the regex engine unchanged instead of being
# read as an (invalid) string escape; the dot is escaped so it only matches a
# literal ".".
course_match = re.search(r'(?<=coursera\.org/learn/)(\w+)', url)
if course_match is None:
    # Fail with a readable message rather than an AttributeError on `.group`.
    raise ValueError("Could not find a course id in url: {!r}".format(url))
course = course_match.group(0)

# Last path segment of the URL; a trailing "/" is dropped first so that it
# does not produce an empty string.
lecture = url.rstrip('/').split('/')[-1]

In [3]:
# Echo the values parsed from the URL, one per line, as a quick sanity check.
print(course, lecture, sep='\n')

siads697698
how-to-do-a-standup


## Load Directory

In [60]:
# Check whether this course's transcript CSV already exists, so that the
# extraction cells below can be skipped. The folder is addressed relative to
# the working directory (the same location the "Save Dataset" cell writes to)
# instead of through a user-specific absolute path.
intermediate_dir = './intermediate_data'
expected_file = 'transcripts_{}.csv'.format(course)

# A missing folder simply means nothing has been saved yet.
directory = os.listdir(intermediate_dir) if os.path.isdir(intermediate_dir) else []

# Compare against the exact file name: a substring test would also fire for
# any other course whose id merely contains this one.
new = expected_file not in directory
if not new:
    print("Course Already In Directory")

Course Already In Directory


## Create List of Filepaths 

In [5]:
if new:
    # Walk everything below the working directory and collect the ".txt"
    # files whose path has this course's id right after "data/".
    directory = os.getcwd()
    filepaths = []
    # Raw string so that `\w` is passed to the regex engine unchanged.
    course_dir_pattern = re.compile(r'(?<=data/)(\w+)')

    for subdir, dirs, files in os.walk(directory):
        for file in files:
            if not file.endswith(".txt"):
                continue
            filepath = subdir + os.sep + file

            # A text file that is not below a "data/" folder produces no
            # match; skip it instead of failing on `.group`.
            match = course_dir_pattern.search(filepath)
            if match is not None and match.group(0) == course:
                filepaths.append(filepath)
    print(len(filepaths))
    print(filepaths[:5])
else:
    print("No need to run cell, files already in directory")

11
['/Users/nicolascap/MADS/Capstone/data/siads697698/04_week-4/03_office-hour-recordings/01_recording-of-elle-o-brien-office-hours-siads-697-698-on-22-07-19-08-00-31-02-07.en.txt', '/Users/nicolascap/MADS/Capstone/data/siads697698/02_week-2/03_office-hour-recording/01_recording-of-elle-o-brien-office-hours-siads-697-698-on-22-07-05-08-01-06-04-27.en.txt', '/Users/nicolascap/MADS/Capstone/data/siads697698/01_week-1/04_office-hour-recording/01_recording-of-elle-o-brien-office-hours-siads-697-698-on-22-06-28-07-57-42-03-17.en.txt', '/Users/nicolascap/MADS/Capstone/data/siads697698/01_week-1/02_videos/04_how-to-write-an-effective-blog-post.en.txt', '/Users/nicolascap/MADS/Capstone/data/siads697698/01_week-1/02_videos/05_how-to-do-a-standup.en.txt']


## Create Transcript DataFrame

In [6]:
if new:
    # Transcripts with at least this many characters are stored as two rows
    # (first half / second half) instead of one.
    max_chars = 40000

    tup = []
    for file_path in filepaths:
        # `with` closes each file as soon as it has been read.
        with open(file_path, 'r') as f:
            A = f.read().replace('\n', ' ')
        course_id = course
        # File name up to its first "." with the first three characters
        # removed, e.g. "05_how-to-do-a-standup.en.txt" -> "how-to-do-a-standup".
        video_title = os.path.basename(file_path).split('.')[0][3:]
        if len(A) < max_chars:
            tup.append((course_id, video_title, A))
        else:
            half = len(A) // 2
            tup.append((course_id, video_title, A[:half]))
            tup.append((course_id, video_title, A[half:]))

    df = pd.DataFrame(tup, columns=['course_id', 'video_title', 'transcripts'])
    df['length'] = df['transcripts'].str.len()
    # Drop rows whose transcript text is empty.
    df = df[df['length'] != 0]
    # A bare `df` inside an `if` body is not rendered by the notebook, so the
    # frame is displayed explicitly.
    display(df)
else:
    print("No need to run cell, files already in directory")

## Save Dataset
We save the transcript dataset as a csv file for further analysis.

In [7]:
if new:
    # Create the output folder on first use so the write cannot fail because
    # the directory is missing.
    output_dir = "./intermediate_data"
    os.makedirs(output_dir, exist_ok=True)
    df.to_csv("{}/transcripts_{}.csv".format(output_dir, course))
else:
    print("No need to run cell, files already in directory")

## Next step
After you have saved the dataset here, run the next step in the workflow [3-TranscriptSummarization.ipynb](./3-TranscriptSummarization.ipynb) or go back to [0-Workflow.ipynb](./0-Workflow.ipynb).

---

**Authors:** [Wei Zhou](mailto:weiwzhou@umich.edu), [Nick Capaldini](mailto:nickcaps@umich.edu), University of Michigan, August 21, 2022

---