# Machine Learning: Text Classification Assignment

In [1]:
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.pipeline import Pipeline
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split as tts
from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader

In [2]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [3]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Use the CategorizedPlaintextCorpusReader to import the AP_News corpus.

In [5]:
DOC_PATTERN = r'.*\.txt'
CAT_PATTERN = r'([\w_\s]+)/.*'
path = '/content/drive/My Drive/DSIO6/nlp_data/AP_News'

corpus = CategorizedPlaintextCorpusReader(path, DOC_PATTERN, cat_pattern=CAT_PATTERN)

### Create two separate lists - one containing the text from each document and another containing the category of each article in the corpus.

In [6]:
docs = [corpus.raw(fileid) for fileid in corpus.fileids()]

categories = [corpus.categories(fileid)[0] for fileid in corpus.fileids()]

### Preprocess the corpus, ensuring to include the following steps.

- Word tokenize the documents.
- Lemmatize, stem, and lowercase all tokens.
- Remove punctuation and stop words.

In [7]:
def preprocess(docs):
  lemmatizer = WordNetLemmatizer()
  stemmer = SnowballStemmer('english')
  preprocessed = []

  for doc in docs:
    tokenized = word_tokenize(doc)

    cleaned = [stemmer.stem(lemmatizer.lemmatize(token.lower())) for token in tokenized 
               if not token.lower() in stopwords.words('english') 
               if token.isalpha()]
    
    untokenized = " ".join(cleaned)
    preprocessed.append(untokenized)
    
  return preprocessed

In [8]:
cleaned_docs = preprocess(docs)

In [9]:
cleaned_docs

['honolulu ap univers hawaii seek addit fund student mental health servic scholarship item new supplementari budget request offici said board regent approv fiscal year supplement oper budget million thursday honolulu report request submit state legislatur democrat gov univers request million hire psychologist system univers hawaii manoa eight psychologist hilo campus three west oahu campus posit communiti colleg one posit said allyson tanouy coordin mental health throughout univers system nation standard one mental health profession per student tanouy said add posit one per low mental health fund would also expand program prevent suicid reduc mental health stigma provid peer educ alert new student parent colleg transit challeng offici said largest item supplement budget million expand hawaii promis program scholarship state institut univers propos flat amount cover tuition fee hawaii resid qualifi feder pell grant look focus needi student go campus said donald straney vice presid acade

### Split the data into training and testing sets with the size of the test set being 30% of the records.

In [10]:
from sklearn.model_selection import train_test_split as tts 
from sklearn.ensemble import RandomForestClassifier

X_train, X_test, y_train, y_test = tts(cleaned_docs, categories, test_size=0.3)

### Construct a pipeline that TF-IDF vectorizes the text and trains a Random Forest classification model.

In [11]:
model = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', RandomForestClassifier()),
])

model.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=Non...
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None,
                                

### Generate predictions on the test set and print a classification report to evaluate how well the model performed.

In [12]:
predictions = model.predict(X_test)

model.score(X_test, y_test)

0.7727272727272727

In [13]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

      health       0.85      0.69      0.76        16
    politics       0.64      0.88      0.74        16
      sports       0.87      1.00      0.93        13
        tech       0.81      0.62      0.70        21

    accuracy                           0.77        66
   macro avg       0.79      0.80      0.78        66
weighted avg       0.79      0.77      0.77        66



### Perform 10-fold cross validation and obtain the averge F1 score across all the folds.

In [14]:
scores = cross_val_score(model, cleaned_docs, categories, cv=10, scoring='f1_macro')
scores.mean()

0.7799052336552337

### Ingest, preprocess, and predict the topic of the article at the following URL.

In [15]:
url = 'https://www.nytimes.com/2019/11/25/business/uber-london.html'

In [19]:
import requests
from bs4 import BeautifulSoup

url = 'https://www.nytimes.com/2020/11/04/technology/california-uber-lyft-prop-22.html'

def get_url_text(url):
    response = requests.get(url)
    content = response.text
    
    TAGS = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7', 'p', 'li']
    soup = BeautifulSoup(content, "lxml")
    text_list = [tag.get_text() for tag in soup.find('article').find_all('p')]
    text = ' '.join(text_list)
    return text


text = get_url_text(url)
text 

'Disabling auto-updates may improve reliability when using a screen reader or keyboard to navigate. Advertisement Supported by The victory of Proposition 22, the most expensive initiative in the state’s history, could help gig companies remake labor laws throughout the country. By Kate Conger OAKLAND, Calif. — Drivers and other workers for so-called gig economy companies in California will not become their employees. California voters carried Uber and Lyft to victory, overwhelmingly approving Proposition 22, a ballot measure that allows gig economy companies to continue treating drivers as independent contractors. Uber, Lyft and the delivery service DoorDash designed the measure to exempt the companies from a state labor law that would have forced them to employ drivers and pay for health care, unemployment insurance and other benefits. As a concession to labor advocates, the initiative offers a wage floor and limited benefits to drivers. The Associated Press projected early Wednesday 

In [17]:
text_cleaned = preprocess([text])

In [18]:
model.predict(text_cleaned)[0]

'politics'