In [124]:
import pandas as pd
import numpy as np
import seaborn as sns
import time
import regex

In [125]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [126]:
df = pd.read_csv('/content/drive/MyDrive/RMP/scraped_comments.csv')

In [127]:
pd.options.display.max_colwidth = 100

In [128]:
df.head(5)

Unnamed: 0,comment_id,firstName,lastName,prof_class,comment,ratingTags,date,attendanceMandatory,grade,clarityRating,difficultyRating,helpfulRating,textbookUse,thumbsDownTotal,thumbsUpTotal,wouldTakeAgain
0,UmF0aW5nLTIyODAzODMy,Marty,Beans,FINITEMATH,Very nice and understanding. A lot of homework but she only grades two problems from it and thei...,,2014-01-29 16:17:28 +0000 UTC,Y,B,4,3,4,5.0,0,0,
1,UmF0aW5nLTE5NjU2ODY4,Marty,Beans,MTH106,"She is very helpful. Gives EC if you go to tutoring. Poor office hours, but willing to help afte...",,2012-01-03 02:19:53 +0000 UTC,,,4,2,4,5.0,0,0,
2,UmF0aW5nLTEyMTIwMDcz,Marty,Beans,MATHSUMFIN,she was nice. good job her,,2006-07-31 20:31:38 +0000 UTC,,,4,4,5,,0,0,
3,UmF0aW5nLTExMTc4MTMx,Marty,Beans,DEVELOPMATH,Professor Beans is one of the best Math teachers you will have. This is coming from someone who ...,,2005-12-15 23:07:38 +0000 UTC,,,5,1,5,,0,0,
4,UmF0aW5nLTEwMTkzMTcx,Marty,Beans,MATH100,"Big smile. Big goals. Nice lady. Bright and cheery. Sweet. Helpful. Loving. Addicted to Math, ...",,2005-08-31 21:17:16 +0000 UTC,N,A-,5,1,5,,0,0,


# Duplicates
Handling duplicates, made possible because of the unique comment_ids RMP provides.

There weren't many duplicates anyway, but it's still helpful to remove them.

In [129]:
print(df.shape)
df['comment_id'].value_counts() # Mistakes with scraping that led to duplicates

(4121664, 16)


UmF0aW5nLTMxOTczNDg=    2
UmF0aW5nLTExNTY0NzQ4    2
UmF0aW5nLTMxNjU1MTc0    2
UmF0aW5nLTEwOTYyODIz    2
UmF0aW5nLTM3OTkwNzk=    2
                       ..
UmF0aW5nLTIzMjgwMjQx    1
UmF0aW5nLTEyMDYwMzQ4    1
UmF0aW5nLTM2OTY4NTA=    1
UmF0aW5nLTI2MjI2MzYw    1
UmF0aW5nLTQxNTc5Ng==    1
Name: comment_id, Length: 4121613, dtype: int64

In [130]:
# Dropping duplicates
print("Size before dropping duplicates:", df.shape)
df.drop_duplicates(subset=['comment_id'], inplace=True)
print("Size after dropping duplicates:", df.shape)
df['comment_id'].value_counts()

Size before dropping duplicates: (4121664, 16)
Size after dropping duplicates: (4121613, 16)


UmF0aW5nLTIyODAzODMy    1
UmF0aW5nLTIwNzgxOTI4    1
UmF0aW5nLTIzNTYzMDky    1
UmF0aW5nLTIzNDk5MzY1    1
UmF0aW5nLTIzMzYxNTA3    1
                       ..
UmF0aW5nLTMwMzQ5NzU=    1
UmF0aW5nLTE2MDUwNzM1    1
UmF0aW5nLTE1NDQ5ODM2    1
UmF0aW5nLTE3NzcwODQ=    1
UmF0aW5nLTQxNTc5Ng==    1
Name: comment_id, Length: 4121613, dtype: int64

# Clarity Ratings
The metrics students use to rate a professor. In the past, could give floating point ratings but upon scraping only ints were gathered.

In [131]:
print("Number of null ratings:", df['clarityRating'].isna().sum()) # No missing ratings
df['clarityRating'].value_counts()

Number of null ratings: 0


 5    1686243
 4     835475
 1     639677
 3     508522
 2     451695
-1          1
Name: clarityRating, dtype: int64

In [132]:
df = df[df['clarityRating'] != -1]
df['clarityRating'].value_counts()

5    1686243
4     835475
1     639677
3     508522
2     451695
Name: clarityRating, dtype: int64

# Null values
Not only can a comment be empty, but sometimes RMP labels empty comments with "No comments". We consider both to be null values that are dropped from the dataset

In [133]:
print("Rows with empty comments:", df['comment'].isna().sum()) # Empty comments exist
print("Rows named \'No Comments\':", (df['comment'] == 'No Comments').sum())

Rows with empty comments: 7310
Rows named 'No Comments': 184430


In [134]:
df.dropna(subset=['comment'], inplace=True)
df = df[df['comment'] != 'No Comments']

In [135]:
print("Rows with empty comments:", df['comment'].isna().sum())
print("Rows named \'No Comments\':", (df['comment'] == 'No Comments').sum())

Rows with empty comments: 0
Rows named 'No Comments': 0


In [136]:
df.reset_index(drop=True, inplace=True)

# Swear words
We might want to get a count for how often these appear, or their word counts in the entire population

What might be interesting as well is viewing how these words coincide with the rating of the review

In [137]:

df_swear_comments = df[df['comment'].str.contains("(fuck|shit|asshole|dick|penis|cock|vagina|pussy|cunt|retard|bitch|slut|whore)")]
df_swear_comments.reset_index(drop=True, inplace=True)
print(df_swear_comments.shape)

  


(6371, 16)


In [138]:
df_swear_comments.iloc[15:20]

Unnamed: 0,comment_id,firstName,lastName,prof_class,comment,ratingTags,date,attendanceMandatory,grade,clarityRating,difficultyRating,helpfulRating,textbookUse,thumbsDownTotal,thumbsUpTotal,wouldTakeAgain
15,UmF0aW5nLTEwNTY4MTY1,Barb,Stengel,EDFN111,she's insane.\r\n better than the other retards over at stayer.,,2005-10-09 13:42:34 +0000 UTC,,,4,1,5,,0,0,
16,UmF0aW5nLTM0NjM2ODY=,Barb,Stengel,EDFN211,"Excellent, ten times better than all the other Stayer retards",,2005-03-16 14:10:04 +0000 UTC,,,5,1,5,,0,0,
17,UmF0aW5nLTEyMzg3ODA3,Joseph,Caspar,MAT110,U know the scene from Saw where the guy cuts out his own eye&#63; It's kinda like that. If you g...,,2006-11-09 15:10:56 +0000 UTC,,,1,5,1,5.0,0,0,
18,UmF0aW5nLTExNjQyMTc4,John,Kim,COMPLIT,"This guy is pretty cocky, but his intelligence is the key to getting a good grade! I did not buy...",,2006-04-01 02:58:02 +0000 UTC,,,5,1,3,,0,0,
19,UmF0aW5nLTEwOTU0MDA4,Bessma,Momani,HIST130,"I had her last year. She was a great prof. Funny, good lectures, puts notes on reserve at librar...",,2005-11-19 16:08:46 +0000 UTC,Y,Fail,5,2,5,,0,0,


In [139]:
list_of_comments_with_swears = list(df_swear_comments['comment'])
for comment in list_of_comments_with_swears[15:20]:
  print(comment + '\n')

she's insane.
 better than the other retards over at stayer.

Excellent, ten times better than all the other Stayer retards

U know the scene from Saw where the guy cuts out his own eye&#63; It's kinda like that. If you get him...kill yourself. He's a living corpse who relies on ur stress for sustenance.  Every day he reminds the class they are basicly retarded, and yells at u if u don't understand the lesson.  If u can, kill someone so u can get their spot in another class.

This guy is pretty cocky, but his intelligence is the key to getting a good grade! I did not buy any of the books he told us to.. I just went to every lecture and took notes on his reviews. I got a decent grade... As for future students, buy the books and read the assignments.. you have to finish them in like a week.. not much time, but for an A, y

I had her last year. She was a great prof. Funny, good lectures, puts notes on reserve at library, and her test are good. If anyone doesn't like her i imagine she mad

In [140]:
df_swear_comments['clarityRating'].value_counts()

5    1695
1    1658
4    1077
2     999
3     942
Name: clarityRating, dtype: int64

# HTML-Entities
In the reviews, sometimes characters are encoded in the format '&amp;quot;' or '&amp;#123;' (ironic that to properly show the raw representation I had to use an html-entitiy myself)

These must be cleaned before tokenization or even more eda steps

Using regex to check against entities from https://www.freeformatter.com/html-entities.html

In [141]:
entities_broad = df[df['comment'].str.contains('&[0-9a-zA-Z#]+;')]
entities_broad.shape

(180668, 16)

In [142]:
df['comment'].replace('&([a-zA-z]+|#\d+|[a-zA-Z0-9]{3,});', '', inplace=True, regex=True)
df['comment'].replace('&#63;?', '', inplace=True, regex=True) # Specifically to deal with strange question mark remnants
# df_html_entities = df[df['comment'].str.contains('&([a-zA-z]+|#\d+);')]

In [143]:
entities_broad = df[df['comment'].str.contains('&[0-9a-zA-Z#]+;')]
entities_broad.shape

(1, 16)

In [144]:
entities_broad.reset_index(inplace=True, drop=True)
for comment in entities_broad['comment']:
  print(comment)

Great teacher. His tests are easy if you study his notes.  The midterm isn't written by him, and the final is a nationally written exam that covers chem 1&2; so be prepared for those to drop your grade.


# Links, Phone-Numbers, Email-Addresses

In [145]:
link_inds = df['comment'].str.contains('\s*https?://\S+(\s+|$)')
phone_inds = df['comment'].str.contains('^(\+\d{1,2}\s)?\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}$')
email_inds = df['comment'].str.contains('[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+')
print("Number of links:", df[link_inds].shape)
print("Number of phone-numbers", df[phone_inds].shape)
print("Number of email-addresses", df[email_inds].shape)

  """Entry point for launching an IPython kernel.
  


Number of links: (156, 16)
Number of phone-numbers (0, 16)
Number of email-addresses (250, 16)


In [146]:
df.loc[link_inds, 'comment']

1122       Needs to be treated for Narcissistic Personality Disorder.\r\n Symptoms include:\r\n -Disregard ...
32136      Hated this class. It was so boring and his lectures in class do not even relate to the online te...
34864      http://wps.prenhall.com/esm_audesirk_bloe_7/0,8753,1139971-,00.html - This website could outteac...
37038      If you are taking the American Century course, beware! Its some kind of propaganda, since Corke ...
57339      Please avoid at all costs. Take with Math Dept. The material of the course is not all that diffi...
                                                          ...                                                 
3911623    He's a very nice guy. Tells jokes. And he has a very useful web site                            ...
3917853    I personally enjoyed this class. Yes, his lectures may be mono-toned and fast paced but he makes...
3922972    Please see http://www.ratemyprofessors.com/ShowRatings.jsptid=55852 for Professor Ferrer's ratin...
3

In [168]:
# For some reason, using df.loc[link_inds, 'comment'] wouldn't replace
df['comment'].replace(r'\s*https?://\S+(\s+|$)', ' ', inplace = True, regex = True)
df['comment'].replace('^(\+\d{1,2}\s)?\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}$', ' ', inplace = True, regex = True)
df['comment'].replace('[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+', ' ', inplace = True, regex = True)

In [171]:
print("Number of links:", df[link_inds]['comment'].str.contains('\s*https?://\S+(\s+|$)').shape)
print("Number of phone-numbers", df[phone_inds]['comment'].str.contains('^(\+\d{1,2}\s)?\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}$').shape)
print("Number of email-addresses", df[email_inds]['comment'].str.contains('[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+').shape)

Number of links: (0,)
Number of phone-numbers (0,)
Number of email-addresses (0,)


  """Entry point for launching an IPython kernel.
  


# Symbols
Normally would just be cleaned, but because we're dealing with grades a lot percentages might be useful. 

In [149]:
print(df.shape)
df_symbol_comments = df[df['comment'].str.contains("[*#@%$=_&\^/<>]+")]
print(df_symbol_comments.shape)

(3929872, 16)
(361930, 16)


### Ampersand (&)
Dealing with ampersands, which are very common as a substitute for "and"



In [150]:
# First, shrinking multiple occurences of & in a row -> single occurence
df['comment'].replace('&+', '&', inplace=True, regex=True)

In [151]:
df_amp_easy_comments = df[df['comment'].str.contains(" & ")][['comment']] # Easy case with the &'s, comments use it as substitute for 'and'
df_amp_easy_comments.shape

(87917, 1)

In [152]:
amp_contexts = df_amp_easy_comments['comment'].str.extract(r'(?P<context>.{1,19} & .{1,19})')
amp_contexts.reset_index(inplace=True, drop=True)

In [153]:
for context in amp_contexts.loc[:20, 'context']:
  print(context)

le to work with him &  study efficiently 
s is extremely nice & helpful in her offi
erman was very nice & helpful. I learned 
ll have you at ease & speaking espanol in
rs before the test, & know the study guid
class.2 online test & one final group pro
r in my night class & I've got to say...h
 them up. Open book & Open note exams, wi
ries to make it fun & keep you interested
ries to make it fun & keep you interested
gives great midterm & final reviews so ma
s gramatical errors & focuses less on con
In o92 & 096 I had straight 
ns. He's very blunt & to the point. He he
r. Hixson (Journ220 & Journ227) and loved
 lost. Her lectures & materials on Blackb
s, take notes, read & its a fairly easy A
y. Read the chapter & u'll pass. Beware o
Circuits I & II Professor
other classes on BB & not had these probl
re than English 101 & 102!


In [154]:
df['comment'].replace(' & ', ' ', inplace=True, regex=True)

In [155]:
df_amp_sub = df[df['comment'].str.contains(' & ')]
df_amp_sub.shape

(0, 16)

In [156]:
df_amp_med_comments = df[df['comment'].str.contains("[a-zA-Z]& | &[a-zA-Z]")][['comment']] # Tricky case where comments don't have proper spacing
df_amp_med_comments.shape

(2488, 1)

In [157]:
df['comment'].replace('([a-zA-Z])& ', r'\1 ', inplace=True, regex=True)
df['comment'].replace(' &([a-zA-Z])', r' \1', inplace=True, regex=True)

In [158]:
df_amp_med_comments = df[df['comment'].str.contains("[a-zA-Z]& | &[a-zA-Z]")][['comment']] # Tricky case where comments don't have proper spacing
df_amp_med_comments.shape

(0, 1)

In [159]:
common_amp_abbreviations = df['comment'].str.extract('(?P<abbrev>[a-zA-Z]&[a-zA-Z])')
common_amp_abbreviations.shape

(3929872, 1)

In [160]:
common_amp_abbreviations['abbrev'].str.lower().value_counts().head(50)

a&p    3813
q&a     298
w&m     120
a&m     106
a&b     106
m&f      98
e&m      92
m&m      85
w&l      80
s&t      79
e&e      77
f&m      75
t&f      65
d&c      64
b&w      58
e&h      55
s&q      50
s&a      48
s&s      46
s&e      44
t&t      42
i&i      42
s&p      40
s&c      35
s&b      34
s&d      32
d&d      31
s&h      31
s&m      30
s&l      29
m&a      29
c&i      29
c&t      29
e&s      28
y&p      27
r&w      27
s&i      26
t&c      26
e&c      25
e&f      25
s&n      25
e&t      24
s&f      23
a&c      23
t&p      23
r&r      23
d&f      22
e&i      22
w&j      22
r&d      22
Name: abbrev, dtype: int64

In [161]:
amp_contexts = df[df['comment'].str.contains(r'[a-zA-Z]&[a-zA-Z]')]

In [162]:
print(amp_contexts.shape)
amp_contexts.reset_index(inplace=True, drop=True)
for comment in amp_contexts.loc[:50, 'comment']:
  print(comment)

(8604, 16)
Its a hard class and well he dosent do much, the T&Q are easy if you do the homework. just do the home work like 2 or 3 times and you will pass the class
He doesn't teach the material and doesn't review for the exams he gives. You would be better off taking a different professor. I read the books and studied nonstop and still only scraped a C. I aced the lab portion of it, and that was the reason for the C. I aced microbiology and A&P before I even took his class. Find someone else.
I took this professor for Biology 1406. I looked up which professor to take for my A&P class and when his name popped up, it gave me a flash of PTSD. His lectures had absolutely nothing to do with the materials on the tests. I could have skipped the whole semester, read the textbook by myself and could have gotten the same grade as I received.
Do you really need to pass your a&p with a good grade after so much effort? Then do yourself a favor by not taking the class with him. Please, please and p

### Percent (%)

### Slashes (/ and \\)
- Used to shorten text (w/ standing for with).
- Used to represent a fraction, might be able to replace with "out of"
- Used to abbreivate "or" 

Interestingly, both slashes are sometimes used to express emoticons (:\ or :/)

In [163]:
regular_slashes = df[df['comment'].str.contains('/')]
back_slashes = df[df['comment'].str.contains(r'\\')]
print("Size of regular slashes:", regular_slashes.shape)
print("Size of back slashes:", back_slashes.shape)

Size of regular slashes: (157772, 16)
Size of back slashes: (40, 16)


In [164]:

for comment in back_slashes['comment']:
  print(comment)

Got a C+ in his class after studying like crazy for every test. Worked harder for this stupid wellness class than for all my others and still ended up bringing my gpa down. DONT take the 7:30 class. He is \ a stickler about attendance which is annoying. Not willing to meet with students outside of class and arrogant. Nice enough man but dont take
Nice guy really funny kind of Per\/  but its awesome. Great field trips but tests are hardcore way hard
Dr. Lamourelle's blackboard site is extremely unorganized. Her guidelines for writing essays, and projects are the same. She did respond to all of my emails and gave out her phone number to whoever needed it, but even her responses back were confusing. One time she didn't know where to turn in an assignment. :\. If you can keep away, KEEP AWAY!
Really good at explaining things. Seemed like a nice guy. Thought he did an all-around better job of things than Brenda Gunderson (who everyone else seems to recommend :\).
Most of the ES105 students 

In [165]:
i = 0
for comment in regular_slashes['comment']:
  print(comment)
  i += 1
  if i == 25: break

This guy is awesome.  I totally failed my first quiz and still got a B+ in the class b/c I was able to work with him  study efficiently with his guidance. Go to review sessions! Ask lots of questions! He gives great / hilarious stories in class  as examples. You will work very hard! But you'll be rewarded in the end!
nice person, terrible instructor. she know barely anything about photographic processes/ digital imaging and honestly has no right accepting huge sums of tuition to do so. if you take her class you will learn nothing about the stated subject matter of the class. as a now working professional with hindsight my advice would be, avoid at all costs.
It's funny that people would say she's tough. If you attend class, skim the chaptrs before the test, know the study guides, you'll do fine.  She even has review days. She SAYS 1/2 the test q's come from the book, but as the semester wore on, most were from the lecture. Note: She's very serious about punishing cheaters and talks fem

# Numbers

In [192]:
profs = df.dropna(subset='prof_class')
profs = profs[profs['prof_class'].str.contains('\d{1,3}')]

TypeError: ignored

In [None]:
profs['prof_class']

In [176]:
df_nums = df['comment'].str.extractall('(?P<number>\d+)')

In [183]:
print(df_nums.loc[27])
df.loc[27, 'comment']

      number
match       
0         63
1         40


"To all the people that said she is a good teacher you are insane.  First test, the average was 63 and first quiz the average was 40. She didn't curve it at all and didn't understand why we would want one.  What she teaches is so simplistic compared to what she puts on the test. Good luck because she will probably be the reason you fail out..."

# Observations
* There are duplicates in the comments, easy to drop though
* Professor id was not scraped, **NEEDS TO BE DONE IN FUTURE SCRAPES**
* Clarity rating rounds down
* Clarity ratings skew positve, then heavily negative (5, 4, then 1)
* Some comments are null, and some are null with the text "No Comments".
* There are swear words in some of the reviews, though they may not have been admin reviewed. **Grab this feature when scraping**
* Comments sometimes mention their grade, sometimes enclosed in quotes. While this isn't accounted for right now, should be in the future
* Classes have abbreviations, will be hard to capture true class
* Emojis are filtered out, but how to deal with emoticons like :)?

# TODO


* Have to figure out a way to find reviews that contain words that our models don't have (slang words)
* How to deal with numbers (0-9) and normalize them; might need to just completely ignore them for now
* Abbreviations like w/ need to be expanded
* ~~**Deal with html entities, either by parsing them or removing them**~~
* Common abbreviations using & need to be dealt with (slightly challenging, because it might be like q&a which means question and answer, or & in a university name like A&M)