### HackerRank version: training data from 'training.json', test data from standard input

In [None]:
import json
import sys
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Read training data
# 'training.json' holds one JSON object per line. Its first line may be a bare
# integer (the count of objects that follow); that header is skipped.
# The encoding is pinned to UTF-8 so decoding does not depend on the
# platform's default codec.
training_data = []
with open('training.json', 'r', encoding='utf-8') as f:
    for line_no, raw_line in enumerate(f):
        line = raw_line.strip()
        if not line:
            # json.loads('') raises JSONDecodeError; a blank line (for example
            # a trailing newline at the end of the file) must not abort the load.
            continue
        if line_no == 0 and line.isdigit():
            # Integer header line: not a record.
            continue
        training_data.append(json.loads(line))

# Prepare training data
# Keep only the records that carry all three required fields, then derive the
# classifier inputs (question and excerpt joined by a space) and the labels
# from that one filtered list so the two stay aligned index-for-index.
required_fields = ('question', 'excerpt', 'topic')
labelled_items = [item for item in training_data if all(field in item for field in required_fields)]

train_questions = ["{} {}".format(item['question'], item['excerpt']) for item in labelled_items]
train_topics = [item['topic'] for item in labelled_items]

# Train the Naive Bayes classifier
# Pipeline stages, applied in order: token counts ('vect'), TF-IDF
# re-weighting ('tfidf'), multinomial Naive Bayes ('clf').
pipeline_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]
text_clf = Pipeline(steps=pipeline_steps)
text_clf.fit(train_questions, train_topics)

# Read test data from stdin
# The first input line is the record count N; each of the following N lines
# is parsed as one JSON object.
N = int(input().strip())
test_data = []
for _ in range(N):
    test_data.append(json.loads(input().strip()))

# Same text layout as the training inputs: question, a space, then excerpt.
test_questions = []
for record in test_data:
    test_questions.append("{} {}".format(record['question'], record['excerpt']))

# Predict topics for the test data
predicted_topics = text_clf.predict(test_questions)

# Print the predictions, one topic per line.
# The length guard keeps an empty result from emitting a lone blank line.
if len(predicted_topics) > 0:
    print(*predicted_topics, sep="\n")

### Local system version: reading from 'training.json' and 'sample_test.txt' stored in the working directory

In [11]:
import json
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Read training data from 'training.json'
# Each non-blank line is parsed as JSON. Only dict records that carry
# 'question', 'excerpt' and 'topic' are kept, so other lines (such as a bare
# integer count) are parsed but not stored.
PREVIEW_LIMIT = 10  # number of parsed lines echoed below as a sanity check
training_data = []
previewed = 0
with open('training.json', 'r', encoding='utf-8') as f:
    for raw_line in f:
        line = raw_line.strip()
        if not line:
            # json.loads('') raises JSONDecodeError; a blank or trailing
            # empty line must not abort the whole load.
            continue
        item = json.loads(line)
        if isinstance(item, dict) and 'question' in item and 'excerpt' in item and 'topic' in item:
            training_data.append(item)
        # Echo the first few parsed lines, whether or not they were kept.
        if previewed < PREVIEW_LIMIT:
            print(item)
            previewed += 1

# Prepare training data
# Walk the records once, building the classifier inputs (question and excerpt
# joined by a space) and the matching topic labels side by side.
train_questions = []
train_topics = []
for record in training_data:
    train_questions.append("{} {}".format(record['question'], record['excerpt']))
    train_topics.append(record['topic'])

# Train the Naive Bayes classifier
vectorizer = CountVectorizer()
tfidf_weighting = TfidfTransformer()
naive_bayes = MultinomialNB()
text_clf = Pipeline([('vect', vectorizer), ('tfidf', tfidf_weighting), ('clf', naive_bayes)])
# Kept as the cell's final expression so the notebook displays the fitted pipeline.
text_clf.fit(train_questions, train_topics)


20219
{'topic': 'electronics', 'question': 'What is the effective differencial effective of this circuit', 'excerpt': "I'm trying to work out, in general terms, the effective capacitance of this circuit (see diagram: http://i.stack.imgur.com/BS85b.png).  \n\nWhat is the effective capacitance of this circuit and will the ...\r\n        "}
{'topic': 'electronics', 'question': 'Heat sensor with fan cooling', 'excerpt': 'Can I know which component senses heat or acts as heat sensor in the following circuit?\nIn the given diagram, it is said that the 4148 diode acts as the sensor. But basically it is a zener diode and ...\r\n        '}
{'topic': 'electronics', 'question': 'Outlet Installation--more wires than my new outlet can use [on hold]', 'excerpt': 'I am replacing a wall outlet with a Cooper Wiring USB outlet (TR7745).  The new outlet has 3 wires coming out of it--a black, a white, and a green.  Each one needs to be attached with a wire nut to ...\r\n        '}
{'topic': 'electronics',

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', MultinomialNB())])

In [7]:
# Read test data from 'sample_test.txt'
# The first line of the file is the record count N; each of the next N lines
# is parsed as one JSON object.
test_data = []
with open('sample_test.txt', 'r', encoding='utf-8') as f:
    N = int(f.readline().strip())
    for _ in range(N):
        test_data.append(json.loads(f.readline().strip()))

# Same text layout as the training inputs: question, a space, then excerpt.
test_questions = ["{} {}".format(record['question'], record['excerpt']) for record in test_data]

# Predict topics for the test data
predicted_topics = text_clf.predict(test_questions)

# Print the predictions, one topic per line.
# The length guard keeps an empty result from emitting a lone blank line.
if len(predicted_topics) > 0:
    print("\n".join(str(label) for label in predicted_topics))

scifi
wordpress
scifi
gis
wordpress
photo
wordpress
unix
unix
scifi
wordpress
security
apple
photo
apple
unix
android
gis
electronics
unix
electronics
scifi
apple
photo
photo
electronics
android
scifi
gis
android
unix
scifi
electronics
photo
android
security
android
android
apple
unix
apple
unix
android
wordpress
electronics
mathematica
scifi
unix
apple
scifi
photo
unix
android
android
security
scifi
security
gis
android
mathematica
scifi
unix
unix
mathematica
apple
electronics
mathematica
android
unix
apple
gis
apple
wordpress
photo
android
photo
security
android
photo
wordpress
apple
android
apple
scifi
security
gis
security
wordpress
photo
photo
android
security
gis
scifi
apple
apple
electronics
android
android
unix
unix
security
mathematica
scifi
apple
electronics
wordpress
photo
wordpress
electronics
android
photo
security
gis
unix
android
gis
gis
android
gis
gis
scifi
wordpress
photo
gis
android
gis
electronics
scifi
security
android
unix
wordpress
security
scifi
electronics
scif