# Mount Google drive

In [27]:
import os
from google.colab import drive

In [28]:
# # Change the default root directory for ugur
# drive.mount('/content/drive')
# os.chdir("drive/MyDrive/Final-Project")

In [29]:
# Change the default root directory for mert
drive.mount('/gdrive')
os.chdir("/gdrive/My Drive/Notes/NLP/Final-Project/")

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


# Import the libraries

In [30]:
import pandas as pd
import numpy as np

In [31]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Read the dataset

In [32]:
dataset = 'dataset/C50/C50Train.csv'

In [33]:
df = pd.read_csv(dataset)

In [34]:
grouped = df.groupby("author")["texts"].apply(list)

In [35]:
grouped.iloc[0][0]

'The U.S. Postal Service announced Wednesday a plan to boost online commerce by enhancing the security and reliability of electronic mail traveling on the Internet.Under the plan, businesses and consumers can verify that e-mail has not been tampered with and use services now available for ordinary mail like sending a certified letter."The leap from trading messages to buying and selling goods has been blocked by the fear of security threats," Robert Reisner, vice president of stategic planning, said.  "To expand from local area networks and bilateral secure communications to wide use of electronic commerce will require a new generation of security services," Reisner said.Cylink Corp is developing a system for the Post Office to use to verify the identity of e-mail senders. The system will enable people to register a digital "signature" with the Post Office that can be compared against electronic mail they send.If any tampering is discovered, the Postal Service would investigate, just l

# Preprocessing

In [36]:
# A list of contractions from http://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python
contractions = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he's": "he is",
"how'd": "how did",
"how'll": "how will",
"how's": "how is",
"i'd": "i would",
"i'll": "i will",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'll": "it will",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"must've": "must have",
"mustn't": "must not",
"needn't": "need not",
"oughtn't": "ought not",
"shan't": "shall not",
"sha'n't": "shall not",
"she'd": "she would",
"she'll": "she will",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"that'd": "that would",
"that's": "that is",
"there'd": "there had",
"there's": "there is",
"they'd": "they would",
"they'll": "they will",
"they're": "they are",
"they've": "they have",
"wasn't": "was not",
"we'd": "we would",
"we'll": "we will",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"where'd": "where did",
"where's": "where is",
"who'll": "who will",
"who's": "who is",
"won't": "will not",
"wouldn't": "would not",
"you'd": "you would",
"you'll": "you will",
"you're": "you are"
}

In [37]:
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

In [38]:
!pip install text-preprocessing
import re
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from text_preprocessing import preprocess_text, remove_stopword
from text_preprocessing import to_lower, remove_email, remove_url, lemmatize_word, remove_punctuation, check_spelling, expand_contraction, remove_name, remove_number, remove_special_character, remove_punctuation, remove_whitespace, normalize_unicode, remove_stopword, preprocess_text

def clean_text(text, remove_stopwords=True):
  preprocess_functions = [to_lower, remove_email, remove_url, remove_punctuation, lemmatize_word, check_spelling, expand_contraction, remove_name, remove_stopword]
  # text = preprocess_text(text, preprocess_functions)

  # Convert words to lower case
  text = text.lower()
  
  # Replace contractions with their longer forms 
  if True:
      text = text.split()
      new_text = []
      for word in text:
          if word in contractions:
              new_text.append(contractions[word])
          else:
              new_text.append(word)
      text = " ".join(new_text)
  
  # Format words and remove unwanted characters
  text = re.sub(r'&amp;', '', text) 
  text = re.sub(r'0,0', '00', text) 
  text = re.sub(r'[_"\-;%()|.,+&=*%.,!?:#@\[\]]', ' ', text)
  text = re.sub(r'\'', ' ', text)
  text = re.sub(r'\$', ' $ ', text)
  text = re.sub(r'j k ', ' jk ', text)
  text = re.sub(r' s ', ' ', text)
  text = re.sub(r' yr ', ' year ', text)
  text = re.sub(r' l g b t ', ' lgbt ', text)
  #text = re.sub(r' ', '', text)
  
  # Optionally, remove stop words
  if remove_stopwords:
      text = text.split()
      stops = set(stopwords.words("english"))
      text = [w for w in text if not w in stops]
      text = " ".join(text)
  return text



In [39]:
stemmer = PorterStemmer()

In [40]:
df["texts"] = df["texts"].apply(lambda x: clean_text(x))

In [41]:
df["texts"]

0       u postal service announced wednesday plan boos...
1       internet may overflowing new technology crime ...
2       growing business business internet poses major...
3       computer scientist barred exporting floppy dis...
4       international task force working resolve simme...
                              ...                        
2495    communist party chief jiang zemin put personal...
2496    china gave new hints monday three year austeri...
2497    china vowed wind lengthy probe disgraced forme...
2498    china sent mixed signals united states visit s...
2499    china taken cue u federal reserve chief alan g...
Name: texts, Length: 2500, dtype: object

In [42]:
%cd "dataset/C50"

/gdrive/My Drive/Notes/NLP/Final-Project/dataset/C50


In [43]:
df = pd.read_csv("cleaned.csv")
df

Unnamed: 0,author,text
0,AaronPressman,postal service announced wednesday plan boost ...
1,AaronPressman,internet overflowing technology crime cyberspa...
2,AaronPressman,growing business business internet pose major ...
3,AaronPressman,computer scientist barred exporting floppy dis...
4,AaronPressman,international task force working resolve contr...
...,...,...
2495,WilliamKazer,communist party chief zemin ha put personal st...
2496,WilliamKazer,china gave hint monday three year austerity pr...
2497,WilliamKazer,china ha vowed wind lengthy probe disgraced fo...
2498,WilliamKazer,china ha sent mixed signal united state visit ...


# Train and Apply Model

In [44]:
df

Unnamed: 0,author,text
0,AaronPressman,postal service announced wednesday plan boost ...
1,AaronPressman,internet overflowing technology crime cyberspa...
2,AaronPressman,growing business business internet pose major ...
3,AaronPressman,computer scientist barred exporting floppy dis...
4,AaronPressman,international task force working resolve contr...
...,...,...
2495,WilliamKazer,communist party chief zemin ha put personal st...
2496,WilliamKazer,china gave hint monday three year austerity pr...
2497,WilliamKazer,china ha vowed wind lengthy probe disgraced fo...
2498,WilliamKazer,china ha sent mixed signal united state visit ...


In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
dfTrain = pd.read_csv("C50Train.csv")
dfTest = pd.read_csv("C50test.csv")
x_train, y_train = dfTrain["texts"],dfTrain["author"]
x_test,y_test = dfTest["text"],dfTest["author"]

In [46]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import BaggingClassifier 

#LinearSVC .68
# Multinomial NB .63
#Random Forest .65
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC() ),
])
text_clf.fit(x_train, y_train)

predicted = text_clf.predict(x_test)
print(classification_report(y_test, predicted))

                   precision    recall  f1-score   support

    AaronPressman       0.87      0.96      0.91        50
       AlanCrosby       0.84      0.54      0.66        50
   AlexanderSmith       0.47      0.32      0.38        50
  BenjaminKangLim       0.31      0.24      0.27        50
    BernardHickey       0.89      0.66      0.76        50
      BradDorfman       0.68      0.94      0.79        50
 DarrenSchuettler       0.41      0.28      0.33        50
      DavidLawder       0.64      0.50      0.56        50
    EdnaFernandes       0.96      0.52      0.68        50
      EricAuchard       0.42      0.46      0.44        50
   FumikoFujisaki       0.94      1.00      0.97        50
   GrahamEarnshaw       0.73      0.92      0.81        50
 HeatherScoffield       0.33      0.42      0.37        50
       JanLopatka       0.60      0.42      0.49        50
    JaneMacartney       0.30      0.26      0.28        50
     JimGilchrist       0.93      1.00      0.96       

In [47]:
test_sentence = "postal service announced wednesday plan"
def predict(sentence):
  return text_clf.predict([clean_text(sentence)])

print(predict(test_sentence))

['RobinSidel']
