In [1]:
import csv
import pandas as pd
import numpy as np

## 1. Load in data

In [2]:
# Read the raw reviews and discard the 'tag_list' column in one expression
df = pd.read_csv('6011_reviews.csv').drop(columns='tag_list')
df.shape

(6010, 13)

In [3]:
df.head()

Unnamed: 0,name,department,overall_quality,would_take_again_%,difficulty_score,date,for_credit,attendance,textbook_used,would_take_again,grade_received,comment,rating
0,"Bhati, Karni",English,2.7,40%,3.0,12/26/2016,Yes,Not Mandatory,Yes,Yes,A,Dr. Bhati is a nice guy really and very friend...,AWESOME
1,"Bhati, Karni",English,2.7,40%,3.0,11/28/2016,Yes,Mandatory,No,No,A,"He is a nice, smart man, who is passionate abo...",AWFUL
2,"Bhati, Karni",English,2.7,40%,3.0,11/28/2016,Yes,Mandatory,Yes,Yes,,"Dr. Bhati is such a sweet person, but his teac...",AVERAGE
3,"Bhati, Karni",English,2.7,40%,3.0,11/07/2016,Yes,Mandatory,Yes,No,A,"He's a nice man, but his teaching was not very...",AVERAGE
4,"Bhati, Karni",English,2.7,40%,3.0,10/26/2016,Yes,,Yes,No,Not sure yet,Dr. Bhati is a friendly and interesting profes...,POOR


## 2. Pre-processing and Data Cleaning

In [4]:
# Show the distinct values of every nominal attribute, in the frame's column order
nominal_columns = (
    'department',
    'for_credit',
    'attendance',
    'textbook_used',
    'would_take_again',
    'grade_received',
    'rating',
)

for column in df.columns:
    # Skip anything that is not one of the nominal attributes
    if column not in nominal_columns:
        continue
    print(column)
    print(df[column].unique())

department
['English' 'Mathematics' 'Biology' 'Theology' 'Science' 'Economics'
 'Philosophy' 'Marketing' 'Chemistry' 'History' 'Languages'
 'Social Science' 'Physical Ed' 'Psychology' 'Health Science'
 'Political Science' 'Theater' 'Classics' 'Religion' 'Physics'
 'Anthropology' 'Music' 'Fine Arts' 'Geology' 'Communication'
 'Computer Science' 'Business']
for_credit
['Yes' nan 'No']
attendance
['Not Mandatory' 'Mandatory' nan]
textbook_used
['Yes' 'No' nan]
would_take_again
['Yes' 'No' nan]
grade_received
['A' nan 'Not sure yet' 'B' 'D' 'D+' 'A-' 'WD' 'B+' 'B-' 'C' 'C+' 'A+'
 'D-' 'INC' 'C-' 'F']
rating
['AWESOME' 'AWFUL' 'AVERAGE' 'POOR' 'GOOD']


In [5]:
# Anonymize the dataset to make it a shadow dataset with numbers:
# every professor name is replaced by an integer ID, numbered 0, 1, 2, ...
# in order of first appearance in the frame.
name_list = df['name'].unique()

# pd.factorize assigns those IDs in a single pass and yields an integer column,
# instead of one full-column comparison and masked write per unique name.
# NOTE: a missing name (NaN) would receive the sentinel ID -1.
df['name'] = pd.factorize(df['name'])[0]

# Change the name of the attribute to 'professor'
df = df.rename(columns={'name':'professor'})
df.head(1)

Unnamed: 0,professor,department,overall_quality,would_take_again_%,difficulty_score,date,for_credit,attendance,textbook_used,would_take_again,grade_received,comment,rating
0,0,English,2.7,40%,3.0,12/26/2016,Yes,Not Mandatory,Yes,Yes,A,Dr. Bhati is a nice guy really and very friend...,AWESOME


## 3. Gender Detection

In [6]:
# Number of reviews per professor ID, most-reviewed first (value_counts order)
review_counts = df['professor'].value_counts()
num_reviews_per_prof = dict(review_counts)

# Live view over the professor IDs, in the same order as the counts above
prof_list = num_reviews_per_prof.keys()

In [7]:
import re

# We will detect the gender of the professor based on the reviews submitted by students,
# since RMP does not provide that information
male_list = ['he', 'his', 'him', 'himself']
female_list = ['she', 'her', 'hers', 'herself']


def _pronoun_pattern(words):
    """Compile a regex matching any of `words` as a whole word, ignoring case.

    Case is ignored so that sentence-initial pronouns ("He is ...", "She was ...")
    are counted as well.
    """
    return re.compile(r'\b(?:%s)\b' % '|'.join(map(re.escape, words)), re.IGNORECASE)


def _count_matches(pattern, comments):
    """Return the total number of `pattern` matches over an iterable of comments.

    Entries that are not strings (e.g. NaN for a missing comment) are skipped.
    """
    return sum(len(pattern.findall(text)) for text in comments if isinstance(text, str))


male_pattern = _pronoun_pattern(male_list)
female_pattern = _pronoun_pattern(female_list)

# Decide one label per professor from the pronouns used across all of their reviews
gender_by_prof = {}
for prof, prof_comments in df.groupby('professor')['comment']:
    # counters for male pronouns and female pronouns
    mcount = _count_matches(male_pattern, prof_comments)
    fcount = _count_matches(female_pattern, prof_comments)
    # Ties (including "no pronouns found") fall back to 'female'
    gender_by_prof[prof] = 'male' if mcount > fcount else 'female'

# Look the label up by professor ID for every row, so each review gets the label of
# its own professor regardless of how the rows are ordered in the frame.
gender = df['professor'].map(gender_by_prof).tolist()

# Insert the gender column into the DataFrame (third column)
df.insert(loc=2, column='gender', value=gender)

In [8]:
# Checkpoint: write the frame (now including 'gender') to disk without the row index
df.to_csv(path_or_buf='reviews.csv', index=False)

## 4. Polarity and subjectivity

In [9]:
from textblob import TextBlob
# We will use the built-in Sentiment Analysis of TextBlob package

def polarity(text):
    """Return the polarity score of TextBlob's sentiment analysis for `text`."""
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity

def subjectivity(text):
    """Return the subjectivity score of TextBlob's sentiment analysis for `text`."""
    sentiment = TextBlob(text).sentiment
    return sentiment.subjectivity

# Analyse every comment exactly once and read both scores from the same sentiment
# result, rather than building two TextBlob objects per row and writing the
# values cell by cell.
sentiments = [TextBlob(text).sentiment for text in df['comment']]

# Whole-column assignment; the lists are in the frame's row order
df['polarity'] = [sentiment.polarity for sentiment in sentiments]
df['subjectivity'] = [sentiment.subjectivity for sentiment in sentiments]

In [10]:
df.head()

Unnamed: 0,professor,department,gender,overall_quality,would_take_again_%,difficulty_score,date,for_credit,attendance,textbook_used,would_take_again,grade_received,comment,rating,polarity,subjectivity
0,0,English,male,2.7,40%,3.0,12/26/2016,Yes,Not Mandatory,Yes,Yes,A,Dr. Bhati is a nice guy really and very friend...,AWESOME,0.321875,0.4625
1,0,English,male,2.7,40%,3.0,11/28/2016,Yes,Mandatory,No,No,A,"He is a nice, smart man, who is passionate abo...",AWFUL,0.245536,0.747718
2,0,English,male,2.7,40%,3.0,11/28/2016,Yes,Mandatory,Yes,Yes,,"Dr. Bhati is such a sweet person, but his teac...",AVERAGE,0.1725,0.45
3,0,English,male,2.7,40%,3.0,11/07/2016,Yes,Mandatory,Yes,No,A,"He's a nice man, but his teaching was not very...",AVERAGE,0.132993,0.464796
4,0,English,male,2.7,40%,3.0,10/26/2016,Yes,,Yes,No,Not sure yet,Dr. Bhati is a friendly and interesting profes...,POOR,0.185,0.51


In [None]:
# Checkpoint: overwrite reviews.csv with the frame that now carries the sentiment columns
df.to_csv(path_or_buf='reviews.csv', index=False)

## 5. Comment length

In [12]:
# Spot check: number of words TextBlob tokenizes in the comment of the row labelled 0
comment = df.loc[0, 'comment']
txt = TextBlob(comment)
len(txt.words)

60

In [13]:
def len_words(comment):
    """Return how many words TextBlob tokenizes out of `comment`."""
    blob = TextBlob(comment)
    return len(blob.words)

# Swap the free-text comment for its word count (kept in the same column position)
# and rename the column to match its new content.
df = (
    df
    .assign(comment=df['comment'].map(len_words))
    .rename(columns={'comment': 'comment length'})
)

In [14]:
df.head()

Unnamed: 0,professor,department,gender,overall_quality,would_take_again_%,difficulty_score,date,for_credit,attendance,textbook_used,would_take_again,grade_received,comment length,rating,polarity,subjectivity
0,0,English,male,2.7,40%,3.0,12/26/2016,Yes,Not Mandatory,Yes,Yes,A,60,AWESOME,0.321875,0.4625
1,0,English,male,2.7,40%,3.0,11/28/2016,Yes,Mandatory,No,No,A,67,AWFUL,0.245536,0.747718
2,0,English,male,2.7,40%,3.0,11/28/2016,Yes,Mandatory,Yes,Yes,,35,AVERAGE,0.1725,0.45
3,0,English,male,2.7,40%,3.0,11/07/2016,Yes,Mandatory,Yes,No,A,43,AVERAGE,0.132993,0.464796
4,0,English,male,2.7,40%,3.0,10/26/2016,Yes,,Yes,No,Not sure yet,57,POOR,0.185,0.51


In [15]:
# Checkpoint: overwrite reviews.csv with the frame whose comments are now word counts
df.to_csv(path_or_buf='reviews.csv', index=False)

## 6. Other