In [107]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np


In [173]:
def _fetch_article(url, timeout):
    """Download ``url`` and return its first ``<article>`` element, or None if there is none."""
    page = requests.get(url, timeout=timeout)
    return BeautifulSoup(page.content, "html.parser").find("article")


def _pros_cons(article, block_class):
    """Return the stripped, non-empty ``<li>`` texts inside the ``block_class`` element.

    Returns None when ``article`` is None or contains no element with that class.
    """
    if article is None:
        return None
    block = article.find(class_=block_class)
    if block is None:
        return None
    texts = (item.text.strip() for item in block.find_all("li"))
    return [text for text in texts if text != '']


def getreview(product, timeout=10):
    """Scrape the pros/cons bullet points for ``product`` from TechRadar and Trusted Reviews.

    Parameters
    ----------
    product : str
        Review slug appended to both sites' ``/reviews/`` URLs
        (e.g. ``"intel-core-i5-10600k"``). The same slug is used for both sites.
    timeout : float, optional
        Seconds to wait for each HTTP request before ``requests`` raises a
        timeout error. Without it a stalled server would block the notebook.

    Returns
    -------
    dict
        ``{}`` when the TechRadar page has no ``<article>`` or no ``pro-con``
        block. Otherwise ``{"title": str, "reviews": list[str]}``, where
        ``"title"`` is omitted if no ``review-title-medium`` element exists and
        ``"reviews"`` holds the TechRadar points followed by the Trusted
        Reviews points. If the Trusted Reviews page lacks an ``<article>`` or a
        ``pros-cons-block``, only the TechRadar points are returned.
    """
    techradar = _fetch_article("https://www.techradar.com/reviews/" + product, timeout)
    techradar_points = _pros_cons(techradar, "pro-con")
    if techradar_points is None:
        return {}

    review = {}
    headings = techradar.find_all(class_="review-title-medium")
    if headings:
        # When several headings match, the last one is kept.
        review["title"] = headings[-1].text.strip()
    review["reviews"] = techradar_points

    trusted = _fetch_article("https://www.trustedreviews.com/reviews/" + product, timeout)
    trusted_points = _pros_cons(trusted, "pros-cons-block")
    if trusted_points:
        # Plain list concatenation; no need to round-trip through a NumPy array.
        review["reviews"] = techradar_points + trusted_points

    return review

In [113]:
# Smoke test: scrape one known product slug and display the resulting review dict.
getreview("intel-core-i5-10600k")

{'title': 'Intel Core i5-10600K review',
 'reviews': ['Excellent multi-core performance',
  'Improved single-core performance',
  'Affordable',
  'Good thermal performance',
  'No PCIe 4.0',
  'Higher power consumption',
  'Impressive single-threaded performance',
  'Consistently fast in games',
  'Now includes Hyper-Threading',
  'Chipset has upgraded networking',
  'AMD remains faster in multi-threaded workloads',
  "More expensive than AMD's rivals",
  'Relatively expensive ecosystem',
  'No native PCIe 4.0 support']}

In [166]:
def getproduct(url):
    """Return the last path segment of ``url`` (the product slug).

    Trailing slashes are ignored, so ``".../reviews/foo"`` and
    ``".../reviews/foo/"`` both yield ``"foo"``. A string without any ``/`` is
    returned unchanged.
    """
    return url.rstrip("/").rsplit("/", 1)[-1]

In [167]:

def title(pages=9, timeout=10):
    """Collect product slugs from the Trusted Reviews PC-components listing.

    Parameters
    ----------
    pages : int, optional
        Number of listing pages to scrape, starting at page 1.
    timeout : float, optional
        Seconds to wait for each HTTP request before ``requests`` raises a
        timeout error.

    Returns
    -------
    list[str]
        The result of ``getproduct`` on the ``href`` of every non-empty
        ``entry`` element, in page order. Pages without a ``listing-items``
        container are skipped.
    """
    res = []
    for page_number in range(1, pages + 1):
        URL = "https://www.trustedreviews.com/reviews/pc-components/page/" + str(page_number)
        response = requests.get(URL, timeout=timeout)
        soup = BeautifulSoup(response.content, "html.parser")

        listing = soup.find(class_="listing-items")
        if listing is None:
            # Page missing or layout changed: skip it rather than fail on None.
            continue

        for entry in listing.find_all(class_="entry"):
            if entry.text.strip() != '':
                res.append(getproduct(entry['href'].strip()))

    return res

In [168]:
# Collect the product slugs from the listing pages (one HTTP request per page).
rev = title()

In [176]:
# Scrape every slug in order and keep only the non-empty results
# (getreview returns {} when it finds nothing for a slug).
reviews = [revi for revi in map(getreview, rev) if revi != {}]
    

In [177]:
# Display every scraped review dict collected above.
reviews

[{'title': 'AMD Radeon RX 6600 XT review',
  'reviews': ['Great 1080p performance',
   'Low power consumption',
   'Excellent thermal performance',
   'Too expensive',
   'Not far enough ahead of RTX 3060',
   'Strong 1080p performance',
   'Low power consumption',
   'Wealth of features for Ryzen builds',
   'Struggles with ray tracing without FSR activated',
   'Not as powerful as Nvidia RTX 3060 Ti']},
 {'title': 'AMD Ryzen 7 5800X review',
  'reviews': ['Excellent single-core performance',
   'Strong for gaming',
   'Low power',
   'Major IPC improvement',
   'Price jump from Ryzen 3000',
   'No included cooler',
   'Fantastic all-round performance',
   'Better multi-threaded performance than Intel',
   'Low power consumption',
   'Supports PCIe 4.0',
   'Fairly expensive',
   'Intel still has the edge for gaming',
   'No bundled cooler']},
 {'title': 'Intel Core i5-11600K review',
  'reviews': ['Great gaming performance',
   'Affordable',
   'Big generational jump',
   'PCIe 4.0',

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
import time
from sklearn import svm
from sklearn.metrics import classification_report

In [3]:
# Load the labelled sentiment data: training split first, then the held-out test split.
trainData, testData = (
    pd.read_csv(csv_url)
    for csv_url in (
        "https://raw.githubusercontent.com/Vasistareddy/sentiment_analysis/master/data/train.csv",
        "https://raw.githubusercontent.com/Vasistareddy/sentiment_analysis/master/data/test.csv",
    )
)

In [29]:
# Add one hand-labelled hardware-review sentence to the training set.
# NOTE: this cell rebinds trainData, so running it more than once appends the row again.
trainAdd = pd.DataFrame({"Content": ["The RX 6700 XT in our review beats the RTX 3060 Ti with ease"], "Label":["pos"]})
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat with ignore_index=True produces the same re-indexed frame.
trainData = pd.concat([trainData, trainAdd], ignore_index=True)

In [30]:
# Build the TF-IDF feature extractor used for both splits.
vectorizer = TfidfVectorizer(
    min_df=5,
    max_df=0.8,
    sublinear_tf=True,
    use_idf=True,
)

# Fit on the training text only; the test text is transformed with the
# already-fitted vectorizer so both matrices share the same columns.
train_text = trainData['Content']
test_text = testData['Content']
train_vectors = vectorizer.fit_transform(train_text)
test_vectors = vectorizer.transform(test_text)

In [31]:
# Train a linear-kernel SVM on the TF-IDF features and time both phases.
# time.perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed intervals; time.time() is wall-clock and can jump if the system clock changes.
classifier_linear = svm.SVC(kernel='linear')
t0 = time.perf_counter()
classifier_linear.fit(train_vectors, trainData['Label'])
t1 = time.perf_counter()
prediction_linear = classifier_linear.predict(test_vectors)
t2 = time.perf_counter()
time_linear_train = t1 - t0      # seconds spent in fit()
time_linear_predict = t2 - t1    # seconds spent in predict()

In [32]:
# Report how long training and prediction took.
timing_message = "Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict)
print(timing_message)

# Per-class precision / recall / F1 on the test split, as a nested dict.
report = classification_report(testData['Label'], prediction_linear, output_dict=True)
for heading, label in (('positive: ', 'pos'), ('negative: ', 'neg')):
    print(heading, report[label])

Training time: 7.702950s; Prediction time: 0.750058s
positive:  {'precision': 0.9183673469387755, 'recall': 0.9, 'f1-score': 0.9090909090909091, 'support': 100}
negative:  {'precision': 0.9019607843137255, 'recall': 0.92, 'f1-score': 0.9108910891089109, 'support': 100}


In [43]:
# Classify one hand-written hardware-review sentence with the trained model.
# NOTE(review): the saved output for this cell is ['neg'] although the sentence
# reads as praise — worth checking whether the training data covers this domain.
review = "In our testing, this is the first AMD card in a long time to beat NVIDIA's current-generation flagship, the RTX 3090"
review_vector = vectorizer.transform([review]) # reuse the fitted vectorizer; the sentence is wrapped in a list of one document
print(classifier_linear.predict(review_vector))

['neg']
