# Document Classification

https://www.hackerrank.com/challenges/document-classification/problem

## Input Data

First line T = number of samples.

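In the training file, each following line starts with a number (1-8), which is the category number. The number is followed by a space and the document contents. The testing input has the same layout but without the category number: after the count line, each line is one document.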

The training file will be included in the environment at runtime and will be named "trainingdata.txt". The testing data will be loaded with stdin.

In [1]:
# Read the local training sample into a list of raw lines (newlines kept).
with open('Document_Classification_trainingdata.txt', 'r') as training_file:
    training_data = list(training_file)
# Show the first 20 lines as the cell output.
training_data[:20]

['5485\n',
 '1 champion products ch approves stock split champion products inc said its board of directors approved a two for one stock split of its common shares for shareholders of record as of april the company also said its board voted to recommend to shareholders at the annual meeting april an increase in the authorized capital stock from five mln to mln shares reuter \n',
 '2 computer terminal systems cpml completes sale computer terminal systems inc said it has completed the sale of shares of its common stock and warrants to acquire an additional one mln shares to sedio n v of lugano switzerland for dlrs the company said the warrants are exercisable for five years at a purchase price of dlrs per share computer terminal said sedio also has the right to buy additional shares and increase its total holdings up to pct of the computer terminal s outstanding common stock under certain circumstances involving change of control at the company the company said if the conditions occur the

In [2]:
# Read the local testing sample into a list of raw lines (newlines kept).
with open('Document_Classification_testingdata_input03.txt', 'r') as testing_file:
    testing_data = list(testing_file)
# Show every line as the cell output.
testing_data

['3 \n',
 'This is a document \n',
 'this is another document \n',
 'documents are seperated by newlines']

## Using the local training and testing text files

In [3]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score

# Parse the training set: training_data[0] is the sample count and every
# following line is "<category> <document text>".
n_training = int(training_data[0])

training_class = []
training_text = []
# Only read the declared number of samples so that trailing blank lines are
# not turned into empty documents with an empty label.
for line in training_data[1:1 + n_training]:
    # Split label and text at the first space so both come from one parse.
    label, _sep, document = line.partition(" ")
    training_class.append(label)
    training_text.append(document.rstrip(" \n"))


# Parse the testing set: the first line is the number of documents, followed
# by one document per line.  Bounding by the declared count keeps a trailing
# blank line from producing an extra (empty) document and prediction.
n_test = int(testing_data[0])
testing_text = [
    line.lower().rstrip(" \n") for line in testing_data[1:1 + n_test]
]


# Vectorize the training text as TF-IDF weighted unigrams.
max_range = 1
vectorizer = TfidfVectorizer(
    ngram_range=(1, max_range),
    max_df=0.9,
    stop_words='english',
    # Must be a real bool: the string 'True' is rejected by the parameter
    # validation in scikit-learn 1.2+.
    use_idf=True,
)
vectorized_train = vectorizer.fit_transform(training_text)

# Train the model on the string category labels.
classifier = SGDClassifier()
classifier.fit(vectorized_train, training_class)

y_hat_train = classifier.predict(vectorized_train)
#print("train accuracy: ",accuracy_score(training_class, y_hat_train))


# Vectorize the testing text with the vocabulary fitted on the training set.
vectorized_test = vectorizer.transform(testing_text)

# Presumably the expected categories for the sample documents (TODO confirm).
# Stored as strings so they are comparable with the string labels the
# classifier was trained on and therefore predicts.
test_class = ['1', '4', '8']
# Predict the testing set classes.
y_hat_test = classifier.predict(vectorized_test)
#print("test accuracy: ",accuracy_score(test_class, y_hat_test))

# One predicted category per line.
for pred in y_hat_test:
    print(pred)

2
2
2


## Using the environment's training file for training and stdin for testing

In [4]:
# Enter your code here. Read input from STDIN. Print output to STDOUT

import fileinput
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score


# Load the training file that is provided in the runtime environment.
with open('trainingdata.txt', 'r') as fh:
    training_data = fh.readlines()

# Parse the training set: training_data[0] is the sample count and every
# following line is "<category> <document text>".
n_training = int(training_data[0])

training_class = []
training_text = []
# Only read the declared number of samples so that trailing blank lines are
# not turned into empty documents with an empty label.
for line in training_data[1:1 + n_training]:
    # Split label and text at the first space so both come from one parse.
    label, _sep, document = line.partition(" ")
    training_class.append(label)
    training_text.append(document.rstrip(" \n"))


# Load the testing data (stdin when no file arguments are given).  The first
# line is the number of documents, followed by one document per line.
# Bounding by the declared count keeps a trailing blank line from producing
# an extra (empty) document and prediction.
testing_lines = list(fileinput.input())
n_test = int(testing_lines[0])
testing_text = [
    line.lower().rstrip(" \n") for line in testing_lines[1:1 + n_test]
]


# Vectorize the training text as TF-IDF weighted unigrams.
max_range = 1
vectorizer = TfidfVectorizer(
    ngram_range=(1, max_range),
    max_df=0.9,
    stop_words='english',
    # Must be a real bool: the string 'True' is rejected by the parameter
    # validation in scikit-learn 1.2+.
    use_idf=True,
)
vectorized_train = vectorizer.fit_transform(training_text)

# Train the model on the string category labels.
classifier = SGDClassifier()
classifier.fit(vectorized_train, training_class)

y_hat_train = classifier.predict(vectorized_train)
#print("train accuracy: ",accuracy_score(training_class, y_hat_train))


# Vectorize the testing text with the vocabulary fitted on the training set.
vectorized_test = vectorizer.transform(testing_text)

# Presumably the expected categories for a three-document sample input
# (TODO confirm); only referenced by the disabled accuracy check below.
# Stored as strings so they are comparable with the string labels the
# classifier was trained on and therefore predicts.
test_class = ['1', '4', '8']
# Predict the testing set classes.
y_hat_test = classifier.predict(vectorized_test)
#print("test accuracy: ",accuracy_score(test_class, y_hat_test))

# One predicted category per line on stdout.
for pred in y_hat_test:
    print(pred)

FileNotFoundError: [Errno 2] No such file or directory: 'trainingdata.txt'