In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

## Exploring Dataset

In [2]:
# Load both AG News splits. The CSV files have no header row, so the columns
# come back with the positional names 0, 1 and 2 until they are renamed.
train, test = (
    pd.read_csv("ag_news_csv/train.csv", header=None),
    pd.read_csv("ag_news_csv/test.csv", header=None),
)

In [4]:
# Preview the raw training frame; its columns still carry the positional names 0, 1, 2 here.
train

Unnamed: 0,0,1,2
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli..."
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...
4,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco..."
...,...,...,...
119995,1,Pakistan's Musharraf Says Won't Quit as Army C...,KARACHI (Reuters) - Pakistani President Perve...
119996,2,Renteria signing a top-shelf deal,Red Sox general manager Theo Epstein acknowled...
119997,2,Saban not going to Dolphins yet,The Miami Dolphins will put their courtship of...
119998,2,Today's NFL games,PITTSBURGH at NY GIANTS Time: 1:30 p.m. Line: ...


In [7]:
# Peek at positional column 2 of the test split (the free-text column that is
# later renamed to "review").
test[2]

0       Unions representing workers at Turner   Newall...
1       SPACE.com - TORONTO, Canada -- A second\team o...
2       AP - A company founded by a chemistry research...
3       AP - It's barely dawn when Mike Fitzpatrick st...
4       AP - Southern California's smog-fighting agenc...
                              ...                        
7595    Ukrainian presidential candidate Viktor Yushch...
7596    With the supply of attractive pitching options...
7597    Like Roger Clemens did almost exactly eight ye...
7598    SINGAPORE : Doctors in the United States have ...
7599    EBay plans to buy the apartment and home renta...
Name: 2, Length: 7600, dtype: object

In [8]:
# Give the three positional columns the same readable names in both splits.
train.columns = test.columns = ["class", "title", "review"]

In [9]:
# Check class balance in the training split (labels are still integer codes at this point).
train["class"].value_counts()

3    30000
4    30000
2    30000
1    30000
Name: class, dtype: int64

In [14]:
# Inspect the test rows whose integer label is 4.
test.loc[test["class"].eq(4)]

Unnamed: 0,class,title,review
1,4,The Race is On: Second Private Team Sets Launc...,"SPACE.com - TORONTO, Canada -- A second\team o..."
2,4,Ky. Company Wins Grant to Study Peptides (AP),AP - A company founded by a chemistry research...
3,4,Prediction Unit Helps Forecast Wildfires (AP),AP - It's barely dawn when Mike Fitzpatrick st...
4,4,Calif. Aims to Limit Farm-Related Smog (AP),AP - Southern California's smog-fighting agenc...
5,4,Open Letter Against British Copyright Indoctri...,The British Department for Education and Skill...
...,...,...,...
7573,4,Microsoft buy comes with strings attached,A software company that Microsoft acquired thi...
7574,4,U.S. Army aims to halt paperwork with IBM system,The U.S. Army has struck a deal with IBM and o...
7582,4,Analysis: PeopleSoft users speak out about Ora...,InfoWorld - The great debate over the impact o...
7587,4,Hobbit-finding Boffins in science top 10,AP - Australian scientists who helped discover...


## Relabeling Dataset

In [15]:
# Translate the integer class codes (1-4) into their topic names.
# `replace` leaves any value that is not a key untouched, so:
#   * re-running this cell is a no-op instead of collapsing every
#     already-renamed label into "Sci/Tech";
#   * an unexpected code stays visible in the data rather than being
#     silently mislabeled by a catch-all branch.
CLASS_NAMES = {1: "World", 2: "Sports", 3: "Business", 4: "Sci/Tech"}
train["class"] = train["class"].replace(CLASS_NAMES)
test["class"] = test["class"].replace(CLASS_NAMES)

In [17]:
# Confirm the relabeling: the training split should now be counted by topic name.
train["class"].value_counts()

Business    30000
Sci/Tech    30000
Sports      30000
World       30000
Name: class, dtype: int64

In [18]:
# Same check for the test split.
test["class"].value_counts()

Business    1900
Sci/Tech    1900
Sports      1900
World       1900
Name: class, dtype: int64

## Splitting Dataset

In [20]:
# The model input is the "review" text only (the "title" column is not used);
# the target is the topic label in "class".
X_train, y_train = train["review"], train["class"]
X_test, y_test = test["review"], test["class"]

## Model & Training

In [21]:
# Text classification pipeline: token counts -> TF-IDF weighting -> random forest.
# `random_state` pins the forest's randomness so that a fresh
# "Restart Kernel -> Run All" trains the same model and reports the same metrics.
# `n_jobs=-1` asks the forest to use all available CPU cores.
pipeline = Pipeline([
    ("vectorizer", CountVectorizer()),
    ("tfidf", TfidfTransformer()),
    ("classifier", RandomForestClassifier(n_jobs=-1, random_state=42)),
])

In [22]:
# Train the pipeline on the training texts and their labels; the fitted
# result is kept as `model` for prediction below.
model = pipeline.fit(X_train, y_train)

In [23]:
# Predict labels for the test texts, then print per-class
# precision / recall / F1 against the true labels.
y_pred = model.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

    Business       0.86      0.80      0.83      1900
    Sci/Tech       0.83      0.85      0.84      1900
      Sports       0.90      0.97      0.93      1900
       World       0.90      0.87      0.88      1900

    accuracy                           0.87      7600
   macro avg       0.87      0.87      0.87      7600
weighted avg       0.87      0.87      0.87      7600



## Analyzing Results

In [29]:
# Remove pandas' column-width cap so long text cells are shown in full
# instead of being cut off with "...". This is a global display setting
# and affects every frame rendered after this cell.
pd.set_option("display.max_colwidth", None)

In [30]:
# Put each test text side by side with its true and predicted label
# so individual misclassifications can be inspected.
results = pd.DataFrame(
    dict(Test_case=X_test, actual=y_test, predicted=y_pred)
)


In [31]:
# Show each test text next to its actual and predicted label.
results

Unnamed: 0,Test_case,actual,predicted
0,Unions representing workers at Turner Newall say they are 'disappointed' after talks with stricken parent firm Federal Mogul.,Business,Business
1,"SPACE.com - TORONTO, Canada -- A second\team of rocketeers competing for the #36;10 million Ansari X Prize, a contest for\privately funded suborbital space flight, has officially announced the first\launch date for its manned rocket.",Sci/Tech,Sci/Tech
2,"AP - A company founded by a chemistry researcher at the University of Louisville won a grant to develop a method of producing better peptides, which are short chains of amino acids, the building blocks of proteins.",Sci/Tech,Sci/Tech
3,"AP - It's barely dawn when Mike Fitzpatrick starts his shift with a blur of colorful maps, figures and endless charts, but already he knows what the day will bring. Lightning will strike in places he expects. Winds will pick up, moist places will dry and flames will roar.",Sci/Tech,Sports
4,"AP - Southern California's smog-fighting agency went after emissions of the bovine variety Friday, adopting the nation's first rules to reduce air pollution from dairy cow manure.",Sci/Tech,Sports
...,...,...,...
7595,"Ukrainian presidential candidate Viktor Yushchenko was poisoned with the most harmful known dioxin, which is contained in Agent Orange, a scientist who analyzed his blood said Friday.",World,World
7596,"With the supply of attractive pitching options dwindling daily -- they lost Pedro Martinez to the Mets, missed on Tim Hudson, and are resigned to Randy Johnson becoming a Yankee -- the Red Sox struck again last night, coming to terms with free agent Matt Clement on a three-year deal that will pay the righthander in the neighborhood of \$25 ...",Sports,Sports
7597,"Like Roger Clemens did almost exactly eight years earlier, Pedro Martinez has left the Red Sox apparently bitter about the way he was treated by management.",Sports,Sports
7598,SINGAPORE : Doctors in the United States have warned that painkillers Bextra and Celebrex may be linked to major cardiovascular problems and should not be prescribed.,Business,Sci/Tech
