In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd


In [3]:
# TechPowerUp review listing; the query string selects category "Graphics Cards"
# with pp=25 and order=date.
URL = "https://www.techpowerup.com/review/?category=Graphics+Cards&manufacturer=&pp=25&order=date"
# Without a timeout, requests waits indefinitely on a stalled connection.
page = requests.get(URL, timeout=30)
# Stop here on a 4xx/5xx response instead of parsing an error page further down.
page.raise_for_status()

soup = BeautifulSoup(page.content, "html.parser")

In [38]:
# Container element holding the review listing. BeautifulSoup's find() returns
# None when nothing matches, so check explicitly to get a readable error
# instead of "'NoneType' object has no attribute 'find_all'".
review_list = soup.find(id="list")
if review_list is None:
    raise ValueError('No element with id="list" found in the downloaded page')

# Every element carrying the "clearfix" class inside that container.
results = review_list.find_all(class_="clearfix")

In [41]:
# Build one {"title", "review"} record per listing entry.
# NOTE(review): find_next() searches forward through the document from `item`
# and is not restricted to its descendants, so an entry with no title/teaser of
# its own would pick up the next entry's - confirm against the page markup.
reviews = [
    {
        "title": item.find_next(class_="title").text,
        "review": item.find_next(class_="teaser").text,
    }
    for item in results
]

In [52]:
# Inspect the first scraped record.
reviews[0]

{'title': '\nAMD Radeon RX 6600 XT PCI-Express Scaling\n',
 'review': "\n\t\tWhen the Radeon RX 6600 XT launched with an interface limited to PCI-Express 4.0 x8, lots of discussion emerged about how AMD crippled the bandwidth, and how much it affects the gaming experience. In this article, we're taking a close look at exactly that, comparing 22 titles running at PCIe 4.0, 3.0, 2.0, and even 1.1. Frametimes are included, too.\n\t"}

In [68]:
def clean_data(review: dict) -> dict:
    """Normalise whitespace in the ``title`` and ``review`` fields of a record.

    Every run of whitespace (spaces, tabs, carriage returns, newlines) is
    collapsed to a single space and leading/trailing whitespace is removed.
    Deleting the control characters outright would fuse words that are
    separated only by a line break ("one\\ntwo" -> "onetwo").

    Args:
        review: Mapping with string values under the keys ``"title"`` and
            ``"review"``. It is modified in place.

    Returns:
        The same ``review`` object, for convenient reassignment by the caller.

    Raises:
        KeyError: If either ``"title"`` or ``"review"`` is missing.
    """
    for field in ("title", "review"):
        # split() with no argument splits on any whitespace run and drops
        # empty leading/trailing pieces, so joining with " " normalises it all.
        review[field] = " ".join(review[field].split())
    return review

In [69]:
# Clean every scraped record, writing each result back into its own slot.
for position, record in enumerate(reviews):
    reviews[position] = clean_data(record)

In [70]:
# Show the records after clean_data has been applied.
reviews

[{'title': 'AMD Radeon RX 6600 XT PCI-Express Scaling',
  'review': "When the Radeon RX 6600 XT launched with an interface limited to PCI-Express 4.0 x8, lots of discussion emerged about how AMD crippled the bandwidth, and how much it affects the gaming experience. In this article, we're taking a close look at exactly that, comparing 22 titles running at PCIe 4.0, 3.0, 2.0, and even 1.1. Frametimes are included, too."},
 {'title': 'ASRock Radeon RX 6600 XT Phantom Gaming D Review - Best Overclocker, Best Cooler',
  'review': "The ASRock Radeon RX 6600 XT Phantom Gaming comes with the best cooler of all the RX 6600 XT cards we've tested so far. Fan settings are excellent, too: The card is whisper-quiet and runs only 61°C under full load. In our manual OC testing, we saw excellent results, better than all other RX 6600 XT cards."},
 {'title': 'Sapphire Radeon RX 6600 XT Pulse OC Review',
  'review': "Sapphire's Pulse is a cost-efficient, no-frills Radeon RX 6600 XT custom design that com

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
import time
from sklearn import svm
from sklearn.metrics import classification_report

In [3]:
# Labelled sentiment corpus (columns "Content" and "Label"), already split into
# a training file and a held-out test file.
train_csv_url = "https://raw.githubusercontent.com/Vasistareddy/sentiment_analysis/master/data/train.csv"
test_csv_url = "https://raw.githubusercontent.com/Vasistareddy/sentiment_analysis/master/data/test.csv"

trainData = pd.read_csv(train_csv_url)
testData = pd.read_csv(test_csv_url)

In [44]:
# Display the training frame.
trainData

Unnamed: 0,Content,Label
0,every once in a while you see a film that is s...,pos
1,the love for family is one of the strongest dr...,pos
2,after the terminally bleak reservoir dogs and ...,pos
3,( warning to those who have not seen seven : ...,pos
4,"having not seen , "" who framed roger rabbit "" ...",pos
...,...,...
1796,alexander dumas' the three musketeers is one o...,neg
1797,""" have you ever heard the one about a movie s...",neg
1798,this is the first film in what would become th...,neg
1799,"first impressions : critically , a close-to-aw...",neg


In [29]:
# One hand-written hardware-review sentence, labelled positive, to add to the
# training data.
trainAdd = pd.DataFrame({"Content": ["The RX 6700 XT in our review beats the RTX 3060 Ti with ease"], "Label":["pos"]})
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# with ignore_index=True is the equivalent that works on both.
# NOTE: this cell is not idempotent - re-running it adds the same row again.
trainData = pd.concat([trainData, trainAdd], ignore_index=True)

In [30]:
# TF-IDF settings gathered in one place so they are easy to tweak:
#   min_df=5    -> drop terms that occur in fewer than 5 documents
#   max_df=0.8  -> drop terms that occur in more than 80% of documents
#   sublinear_tf / use_idf -> log-scaled term frequency with IDF weighting
tfidf_options = {
    "min_df": 5,
    "max_df": 0.8,
    "sublinear_tf": True,
    "use_idf": True,
}
vectorizer = TfidfVectorizer(**tfidf_options)

# The vocabulary and IDF weights are learned from the training text only; the
# test text is projected onto that already-fitted vocabulary.
train_vectors = vectorizer.fit_transform(trainData["Content"])
test_vectors = vectorizer.transform(testData["Content"])

In [31]:
# Linear-kernel SVM trained on the TF-IDF vectors, with fit and predict timed
# separately. perf_counter() is a monotonic, high-resolution clock meant for
# measuring intervals; time.time() is wall-clock time and can jump if the
# system clock is adjusted mid-run.
classifier_linear = svm.SVC(kernel='linear')
t0 = time.perf_counter()
classifier_linear.fit(train_vectors, trainData['Label'])
t1 = time.perf_counter()
prediction_linear = classifier_linear.predict(test_vectors)
t2 = time.perf_counter()
# Elapsed seconds for each phase.
time_linear_train = t1 - t0
time_linear_predict = t2 - t1

In [32]:
# Timings measured in the previous cell, in seconds.
elapsed = (time_linear_train, time_linear_predict)
print("Training time: %fs; Prediction time: %fs" % elapsed)

# Per-class precision / recall / F1 / support on the held-out test set.
report = classification_report(testData['Label'], prediction_linear, output_dict=True)
for heading, label in (('positive: ', 'pos'), ('negative: ', 'neg')):
    print(heading, report[label])

Training time: 7.702950s; Prediction time: 0.750058s
positive:  {'precision': 0.9183673469387755, 'recall': 0.9, 'f1-score': 0.9090909090909091, 'support': 100}
negative:  {'precision': 0.9019607843137255, 'recall': 0.92, 'f1-score': 0.9108910891089109, 'support': 100}


In [43]:
# Classify a single hand-written hardware-review sentence with the fitted model.
review = "In our testing, this is the first AMD card in a long time to beat NVIDIA's current-generation flagship, the RTX 3090"
# transform() takes an iterable of documents, hence the one-element list.
review_vector = vectorizer.transform([review])
predicted_label = classifier_linear.predict(review_vector)
print(predicted_label)

['neg']
